In [46]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../raw_data/merged_labeled_data.csv')

In [3]:
y = df.pop('label')
X = df[['nunique_values','n_values','mean','std','median','skew','kurt','shapiro_wilk_test']]

In [4]:
X = X.fillna(0).replace('inf', 0)

In [5]:
X.head(1)

Unnamed: 0,nunique_values,n_values,mean,std,median,skew,kurt,shapiro_wilk_test
0,91.0,303.0,149.6468647,22.905161,153.0,-0.53741,-0.06197,6.6e-05


In [6]:
incorrect_mean = X['mean'].str.contains(':', na=False)
X.loc[incorrect_mean, 'mean'] = 0

In [7]:
X['mean'] = X['mean'].astype(str).astype(float)

In [40]:
array_sum = np.sum(X['mean'])
np.isnan(array_sum)

False

In [97]:
#pd.DataFrame(y).value_counts(normalize = True)

In [9]:
l_encoder = LabelEncoder()
y_encoded = l_encoder.fit_transform(y)

In [68]:
labels_df = pd.concat([pd.DataFrame(y),pd.DataFrame(y_encoded)], axis=1).drop_duplicates()

In [69]:
labels_df

Unnamed: 0,label,0
0,int,4
1,float,3
2,cat-multi,1
11,other,5
12,text,6
21,date,2
26,cat-binary,0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y_encoded,test_size=0.1, random_state=42)

In [71]:
pd.DataFrame(y_train).value_counts()

4    946
1    441
5    367
3    271
2     80
0     73
6     72
dtype: int64

In [95]:
strategy = {0:300,1:441,2:300,3:300,4:946,5:367,6:300}

In [96]:
oversample = SMOTE(sampling_strategy=strategy)
X_resampled, y_resampled = oversample.fit_resample(X_train, y_train)



In [87]:
pd.DataFrame(y_resampled).value_counts()

4    946
1    441
5    367
0    300
2    300
3    300
6    300
dtype: int64

In [47]:
X_resampled['mean']

0      -1.086621
1       0.235638
2       0.000000
3       0.000000
4       0.000000
          ...   
6617    0.000000
6618    0.000000
6619    0.000000
6620    0.000000
6621    0.000000
Name: mean, Length: 6622, dtype: float64

In [76]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

In [77]:
X_train_scaled[2].std()

0.4273157551630806

In [78]:
neigh_model = KNeighborsClassifier()
neigh_model.fit(X_train_scaled,y_resampled)

KNeighborsClassifier()

In [79]:
neigh_model.score(X_test_scaled, y_test)

0.692

In [80]:
y_pred = neigh_model.predict(X_test_scaled)

In [56]:
labels = ['int', 'cat-multi', 'float', 'other', 'text', 'cat-binary', 'date']


In [81]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.86      0.57         7
           1       0.64      0.59      0.62        54
           2       0.06      0.20      0.09         5
           3       0.67      0.52      0.59        23
           4       0.91      0.87      0.89       105
           5       0.70      0.56      0.62        50
           6       0.27      0.50      0.35         6

    accuracy                           0.69       250
   macro avg       0.53      0.59      0.53       250
weighted avg       0.74      0.69      0.71       250

