# Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset; it must contain 'sentence' and 'individual' columns
# alongside the feature columns.
df = pd.read_csv("dataset/data2.csv")

# The two non-feature columns, used below to group and colour the plots
sentences = df['sentence']
individuals = df['individual']

# Everything except the two label columns is treated as a feature
features = df.drop(columns=['sentence', 'individual'])

# Pick up to 8 feature columns at random (without replacement) for plotting.
# Capping the sample size at the number of available columns avoids the
# ValueError np.random.choice raises when asked for more unique samples than
# exist. NOTE: no seed is set, so the selection differs from run to run.
random_columns = np.random.choice(
    features.columns,
    size=min(8, len(features.columns)),
    replace=False,
)

# Restrict the frame to the sampled columns
selected_features = df[random_columns]

# One colour per distinct sentence label
unique_sentences = sentences.unique()
sentence_color = sns.color_palette("husl", len(unique_sentences))

# One colour per distinct individual label
unique_individual = individuals.unique()
individual_color = sns.color_palette("husl", len(unique_individual))

# Number of features to plot (one subplot row each)
num_features = len(selected_features.columns)


In [None]:
# One row per selected feature: the left column shows the feature trace and the
# right column shows the label histogram. squeeze=False keeps `axes`
# two-dimensional even when only one feature is plotted, so axes[i, j]
# indexing always works.
fig, axes = plt.subplots(num_features, 2, figsize=(12, 4 * num_features), squeeze=False)

# The per-label groups fed to the histogram do not depend on the feature,
# so build them once instead of once per subplot row.
label_groups = [sentences[sentences == label] for label in unique_sentences]

# Iterate through features
for i, feature in enumerate(selected_features.columns):
    trace_ax, hist_ax = axes[i, 0], axes[i, 1]

    # Plot the feature as one line per sentence label, each in its own colour
    for sentence, color in zip(unique_sentences, sentence_color):
        subset = df[df['sentence'] == sentence]
        trace_ax.plot(subset.index, subset[feature], label=f'{sentence}', color=color)

    trace_ax.set_ylabel(feature)
    trace_ax.legend()

    # Histogram of the sentence labels, one colour per label
    hist_ax.hist(label_groups, bins='auto', edgecolor='black', label=unique_sentences, color=sentence_color)
    hist_ax.set_title('Label Histogram')
    hist_ax.legend()

# Add a common X-axis label for features (bottom-left subplot only)
axes[-1, 0].set_xlabel('Index')

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show the plot
plt.show()

# Remove Outliers

In [None]:
import pandas as pd

def remove_outliers(df, factor=1.5):
    """
    Mask outliers in each column of a DataFrame using the IQR method.

    For every column the quartiles Q1 and Q3 are computed, and any value
    outside [Q1 - factor * IQR, Q3 + factor * IQR] (IQR = Q3 - Q1) is replaced
    by NaN. Rows in which no value survives are dropped. The input frame is
    not modified.

    Parameters:
    - df: DataFrame with numeric columns.
    - factor: Float, multiplier for IQR. Values outside
      [Q1 - factor * IQR, Q3 + factor * IQR] are considered outliers.

    Returns:
    - New DataFrame with the same columns as `df`, outliers set to NaN and
      all-NaN rows removed. Columns in which values were masked are float.
    """
    # A frame without columns has nothing to filter.
    if len(df.columns) == 0:
        return pd.DataFrame()

    # Compute both quartiles for every column in a single pass.
    quartiles = df.quantile([0.25, 0.75])
    q1 = quartiles.loc[0.25]
    q3 = quartiles.loc[0.75]
    iqr = q3 - q1

    # Per-column inclusive bounds of the non-outlier range.
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Boolean mask of values inside the bounds; NaN compares as False and
    # therefore stays NaN in the result.
    within_range = df.ge(lower_bound, axis=1) & df.le(upper_bound, axis=1)

    # Masking in place of per-column concatenation keeps the original index
    # (including duplicate labels) and avoids quadratic re-copying.
    return df.where(within_range).dropna(how='all')

# Replace the feature frame with its outlier-filtered version returned by
# remove_outliers.
features = remove_outliers(features)


In [None]:
# Keep only the rows that still have at least 280 non-NaN feature values.
features = features.dropna(axis=0, thresh=280)

# Call head() so the cell displays the first rows rather than the repr of
# the bound method.
features.head()

# Imports

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score,roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics
import glob
from scipy import signal
import seaborn as sn
from sklearn.metrics import classification_report
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reading Data

In [None]:
# Reload the dataset from disk
df = pd.read_csv("dataset/data2.csv")

# Split by column position: the first 304 columns form the feature matrix,
# column 304 becomes y and column 305 becomes ind.
# NOTE(review): presumably these are the 'sentence' and 'individual' columns;
# confirm against the CSV layout.
x_df, y_df, ind_df = df.iloc[:, :304], df.iloc[:, 304], df.iloc[:, 305]

# Underlying NumPy arrays for the later cells
x_data, y_data, ind_data = x_df.values, y_df.values, ind_df.values

# Report the shape of each array, one per line
print(x_data.shape, y_data.shape, ind_data.shape, sep="\n")

In [None]:
# Standardise the features (fit and transform in one step) and wrap the
# scaled array in a DataFrame
Xsc = pd.DataFrame(StandardScaler().fit_transform(x_data))

# Labels as a single-column DataFrame
y = pd.DataFrame(y_data)

# Preview the first scaled rows
Xsc.head()

In [None]:
# NOTE(review): remove_outliers builds and returns a new DataFrame; the result
# is not assigned here, so Xsc is left unchanged by this call. Assigning it
# would put NaN values into Xsc and could drop rows relative to y, so confirm
# the intent (and how the model should handle NaN) before changing this.
remove_outliers(Xsc)
# Display the shape of Xsc
Xsc.shape

# Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Xsc (scaled features) and y (labels) are defined in the preceding cells

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(Xsc, y, test_size=0.2, random_state=0)

# KNN classifier with distance weighting: closer neighbours get a larger vote.
# (This is neighbour weighting, not class balancing.) n_neighbors and weights
# can be tuned as needed.
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

# y_train is a single-column DataFrame; flatten it to 1-D so fit() receives
# the label shape it expects and does not emit a DataConversionWarning.
knn.fit(X_train, y_train.values.ravel())

# Make predictions on the test set - unseen data
y_pred = knn.predict(X_test)
print(y_pred)
print(y_test)
# Print accuracy score
print(f'Accuracy score: {accuracy_score(y_test, y_pred):0.4f}')

# Print confusion matrix
cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)


# Results

In [None]:
# Draw the confusion matrix as an annotated heat map on a small figure
plt.figure(figsize=(4, 3))
sn.heatmap(data=cf_matrix, annot=True, fmt=' ', cmap='Reds')
plt.title('Confusion Matrix')
plt.show()

# Text summary of the per-class metrics on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))

# Saving the Model

In [None]:
# Persist the fitted KNN classifier to disk with joblib.
# NOTE(review): the file is named 'svm_model.pkl' although the object saved is
# the KNeighborsClassifier `knn`; check for code that loads this path before
# renaming it.
joblib.dump(knn, 'svm_model.pkl')