In [22]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [5]:
# Load the pre-split train/test tables and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSVs were saved — confirm).
Train = pd.read_csv("Train_Set.csv").drop(columns=['Unnamed: 0'])
Test = pd.read_csv("Test_Set.csv").drop(columns=['Unnamed: 0'])

In [6]:
# Downcast every column except the last six to int8, in both tables.
# NOTE(review): assumes the leading columns hold integers that fit in int8 — confirm.
for frame in (Train, Test):
    leading = frame.columns[:frame.shape[1] - 6]
    frame[leading] = frame[leading].astype(dict.fromkeys(leading, np.int8))

In [7]:
# Separate the two target columns from the features.
# 'binned_label' is the classification target and 'label' the regression target
# in the sections that follow.
label_cols = ["label", "binned_label"]

X_Train, X_Test = (frame.drop(columns=label_cols) for frame in (Train, Test))

y_Train, y_binned_Train = Train['label'], Train['binned_label']
y_Test, y_binned_Test = Test['label'], Test['binned_label']

In [8]:
# Shared 5-fold splitter used by Model_Train and by the neural-network training loops.
# shuffle=False is the KFold default, spelled out here: folds are contiguous row ranges.
KF = KFold(n_splits=5, shuffle=False)
def Model_Train(Model, X, Y, cv=None):
    """Fit ``Model`` once per cross-validation fold and return the held-out predictions.

    Parameters
    ----------
    Model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    Y : pandas.Series
        Targets aligned row-for-row with ``X``; also selected with ``.iloc``.
    cv : object, optional
        Splitter whose ``split(X)`` yields ``(train_idx, test_idx)`` pairs.
        When omitted, the module-level ``KF`` splitter is used.

    Returns
    -------
    list
        One entry per fold, in fold order: the output of ``Model.predict`` on
        that fold's held-out rows.

    Notes
    -----
    ``Model.fit`` is called on the same object for every fold, so after this
    function returns the estimator holds whatever state the *last* ``fit``
    call left behind (trained on the final fold's training rows, not on all
    of ``X``).
    """
    splitter = KF if cv is None else cv

    Preds = []
    for train_idx, test_idx in splitter.split(X):
        Model.fit(X.iloc[train_idx], Y.iloc[train_idx])
        Preds.append(Model.predict(X.iloc[test_idx]))

    # Previously the predictions were collected and then silently discarded.
    return Preds

## Classification:

## KNN

In [27]:
# Tune k for k-nearest-neighbours over k = 1..5 using 5-fold cross-validated accuracy.
Grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={"n_neighbors": list(range(1, 6))},
    scoring='accuracy',
    cv=5,
    n_jobs=2,
    verbose=3,
)
Grid.fit(X_Train, y_binned_Train)

# best_estimator_ is the winning configuration as refit by GridSearchCV
# (refit=True is the default).
best_model = Grid.best_estimator_
best_k = Grid.best_params_['n_neighbors']

y_pred = best_model.predict(X_Test)
print("Metrics: \n", classification_report(y_binned_Test, y_pred))

Metrics: 
                  precision    recall  f1-score   support

1425k - 875000k       0.99      0.99      0.99     23365
    188k - 373k       0.98      0.99      0.99     58034
       1k - 89k       0.99      1.00      1.00     23646
    374k - 699k       0.99      0.99      0.99     58602
   700k - 1424k       0.99      0.99      0.99     35088
     90k - 187k       0.99      0.98      0.99     35385

       accuracy                           0.99    234120
      macro avg       0.99      0.99      0.99    234120
   weighted avg       0.99      0.99      0.99    234120


## Naive Bayes

In [43]:
# Gaussian Naive Bayes on the binned target.
# Model_Train calls fit once per fold on this same object, so the test-set
# predictions below come from the state left by the final fold's fit.
NB = GaussianNB()
Model_Train(NB, X_Train, y_binned_Train)

y_pred = NB.predict(X_Test)
nb_report = classification_report(y_binned_Test, y_pred)
print("Metrics: \n", nb_report)

Metrics: 
                  precision    recall  f1-score   support

1425k - 875000k       0.20      0.95      0.33     23365
    188k - 373k       0.55      0.04      0.08     58034
       1k - 89k       0.23      0.83      0.36     23646
    374k - 699k       0.63      0.14      0.23     58602
   700k - 1424k       0.24      0.09      0.13     35088
     90k - 187k       0.26      0.07      0.11     35385

       accuracy                           0.25    234120
      macro avg       0.35      0.35      0.21    234120
   weighted avg       0.41      0.25      0.18    234120


## CART

In [46]:
# Decision tree (CART) classifier with default hyper-parameters on the binned target.
# The tree evaluated below is the one left by Model_Train's final fold fit.
Cart = DecisionTreeClassifier()
Model_Train(Cart, X_Train, y_binned_Train)

y_pred = Cart.predict(X_Test)
cart_report = classification_report(y_binned_Test, y_pred)
print("Metrics: \n", cart_report)

Metrics: 
                  precision    recall  f1-score   support

1425k - 875000k       0.99      0.99      0.99     23365
    188k - 373k       0.98      0.99      0.99     58034
       1k - 89k       1.00      1.00      1.00     23646
    374k - 699k       0.99      0.99      0.99     58602
   700k - 1424k       0.99      0.99      0.99     35088
     90k - 187k       0.99      0.98      0.99     35385

       accuracy                           0.99    234120
      macro avg       0.99      0.99      0.99    234120
   weighted avg       0.99      0.99      0.99    234120


## Random Forest

In [48]:
# Random forest of 10 trees with a fixed seed on the binned target.
# The forest evaluated below is the one left by Model_Train's final fold fit.
RF = RandomForestClassifier(n_estimators=10, random_state=42)
Model_Train(RF, X_Train, y_binned_Train)

y_pred = RF.predict(X_Test)
rf_report = classification_report(y_binned_Test, y_pred)
print("Metrics: \n", rf_report)

Metrics: 
                  precision    recall  f1-score   support

1425k - 875000k       0.99      0.99      0.99     23365
    188k - 373k       0.98      0.99      0.99     58034
       1k - 89k       1.00      1.00      1.00     23646
    374k - 699k       0.99      0.99      0.99     58602
   700k - 1424k       0.99      0.99      0.99     35088
     90k - 187k       1.00      0.98      0.99     35385

       accuracy                           0.99    234120
      macro avg       0.99      0.99      0.99    234120
   weighted avg       0.99      0.99      0.99    234120


## Logistic Regression

In [49]:
# One-vs-rest logistic regression (liblinear solver, up to 10000 iterations)
# on the binned target. The model evaluated below is the one left by
# Model_Train's final fold fit.
base_logit = LogisticRegression(solver='liblinear', max_iter=10000)
LogR = OneVsRestClassifier(base_logit)
Model_Train(LogR, X_Train, y_binned_Train)

y_pred = LogR.predict(X_Test)
logr_report = classification_report(y_binned_Test, y_pred)
print("Metrics: \n", logr_report)

Metrics: 
                  precision    recall  f1-score   support

1425k - 875000k       0.70      0.78      0.74     23365
    188k - 373k       0.50      0.64      0.56     58034
       1k - 89k       0.62      0.55      0.58     23646
    374k - 699k       0.54      0.53      0.54     58602
   700k - 1424k       0.53      0.43      0.48     35088
     90k - 187k       0.51      0.39      0.45     35385

       accuracy                           0.55    234120
      macro avg       0.57      0.55      0.56    234120
   weighted avg       0.55      0.55      0.55    234120


## SVM

In [63]:
# Linear support-vector classifier (dual=False, up to 5000 iterations) on the
# binned target. The model evaluated below is the one left by Model_Train's
# final fold fit.
SVM = LinearSVC(dual=False, max_iter=5000)
Model_Train(SVM, X_Train, y_binned_Train)

y_pred = SVM.predict(X_Test)
svm_report = classification_report(y_binned_Test, y_pred)
print("Metrics: \n", svm_report)

Metrics: 
                  precision    recall  f1-score   support

1425k - 875000k       0.65      0.80      0.72     23365
    188k - 373k       0.49      0.62      0.55     58034
       1k - 89k       0.58      0.57      0.58     23646
    374k - 699k       0.55      0.49      0.52     58602
   700k - 1424k       0.51      0.41      0.46     35088
     90k - 187k       0.50      0.38      0.43     35385

       accuracy                           0.54    234120
      macro avg       0.55      0.55      0.54    234120
   weighted avg       0.53      0.54      0.53    234120


## Neural Network

In [9]:
# Map the bin labels to integer class ids for the network's sparse
# categorical loss; the fitted encoder is kept so the same mapping can be reused later.
encoder = LabelEncoder().fit(y_binned_Train)
y_binned_encoded = pd.Series(encoder.transform(y_binned_Train))

# Stop a fit() call once the training loss has gone 5 epochs without improving.
early_stopping = EarlyStopping(monitor='loss', patience=5)

In [10]:
# Feed-forward classifier: two ReLU hidden layers (128 and 64 units), each
# followed by 20% dropout, and a 6-way softmax output (one unit per bin).
nn = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(6, activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
nn.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the classifier fold by fold.
# NOTE(review): `nn` is not re-created between folds, so each fit() call
# continues from the weights left by the previous fold; from the second fold
# on, the validation rows have already been seen in training, so the reported
# val_* metrics are optimistic.
for train_idx, val_idx in KF.split(X_Train):
    Train_X = X_Train.iloc[train_idx]
    Train_Y = y_binned_encoded.iloc[train_idx]
    Val_X = X_Train.iloc[val_idx]
    Val_Y = y_binned_encoded.iloc[val_idx]

    # The held-out fold used to be sliced and then ignored; passing it as
    # validation_data makes Keras report val_loss / val_accuracy per epoch.
    # Early stopping still monitors the training loss, as configured.
    nn.fit(
        Train_X,
        Train_Y,
        epochs=200,
        batch_size=256,
        validation_data=(Val_X, Val_Y),
        callbacks=[early_stopping],
    )

In [13]:
# Softmax probabilities -> predicted class id (index of the largest output).
y_pred = np.argmax(nn.predict(X_Test), axis=1)

# Encode the test labels with the encoder fitted on the training labels.
# Re-fitting on the test labels (fit_transform) could assign different integer
# ids if the test set's label set differed from the training set's; transform()
# keeps the train-time mapping and raises on labels never seen in training.
y_binned_encoded_test = pd.Series(encoder.transform(y_binned_Test))

print("Metrics: \n", classification_report(y_binned_encoded_test, y_pred))

[1m7317/7317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
Metrics: 
               precision    recall  f1-score   support

           0       0.91      0.91      0.91     23365
           1       0.82      0.87      0.85     58034
           2       0.87      0.83      0.85     23646
           3       0.86      0.86      0.86     58602
           4       0.84      0.83      0.84     35088
           5       0.82      0.79      0.80     35385

    accuracy                           0.85    234120
   macro avg       0.85      0.85      0.85    234120
weighted avg       0.85      0.85      0.85    234120


## Regression

## Linear Regression

In [16]:
# Ordinary least-squares baseline on the raw 'label' target.
# The coefficients evaluated below are those left by Model_Train's final fold fit.
LR = LinearRegression()
Model_Train(LR, X_Train, y_Train)

y_pred = LR.predict(X_Test)
lr_r2 = r2_score(y_Test, y_pred)
print(f"R-squared: {lr_r2:.4f}")

R-squared: 0.2070


## CART

In [18]:
# Regression tree with squared-error splitting and cost-complexity pruning
# (ccp_alpha=0.01). The tree evaluated below is the one left by Model_Train's
# final fold fit.
CartReg = DecisionTreeRegressor(criterion='squared_error', ccp_alpha=0.01)
Model_Train(CartReg, X_Train, y_Train)

y_pred = CartReg.predict(X_Test)
cart_r2 = r2_score(y_Test, y_pred)
print(f"R-squared: {cart_r2:.4f}")

R-squared: 0.9798


## Random Forest

In [20]:
# Random-forest regressor of 10 trees with a fixed seed.
# The forest evaluated below is the one left by Model_Train's final fold fit.
RFReg = RandomForestRegressor(n_estimators=10, random_state=42)
Model_Train(RFReg, X_Train, y_Train)

y_pred = RFReg.predict(X_Test)
rf_r2 = r2_score(y_Test, y_pred)
print(f"R-squared: {rf_r2:.4f}")

R-squared: 0.9758


## SVM

In [23]:
# Linear support-vector regressor; dual='auto' lets scikit-learn choose the
# primal or dual formulation. The model evaluated below is the one left by
# Model_Train's final fold fit.
SVMReg = LinearSVR(dual='auto')
Model_Train(SVMReg, X_Train, y_Train)

y_pred = SVMReg.predict(X_Test)
svr_r2 = r2_score(y_Test, y_pred)
print(f"R-squared: {svr_r2:.4f}")

R-squared: 0.0376


## Neural Network

In [24]:
# Feed-forward regressor: two ReLU hidden layers (128 and 64 units), each
# followed by 20% dropout, and a single linear output unit.
nnReg = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1),
])

# Optimise mean squared error; mean absolute error is tracked alongside it.
nnReg.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

In [None]:
# Train the regressor fold by fold.
# NOTE(review): `nnReg` is not re-created between folds, so each fit() call
# continues from the weights left by the previous fold; from the second fold
# on, the validation rows have already been seen in training, so the reported
# val_* metrics are optimistic.
for train_idx, val_idx in KF.split(X_Train):
    Train_X = X_Train.iloc[train_idx]
    Train_Y = y_Train.iloc[train_idx]
    Val_X = X_Train.iloc[val_idx]
    Val_Y = y_Train.iloc[val_idx]

    # The held-out fold used to be sliced and then ignored; passing it as
    # validation_data makes Keras report validation loss / MAE per epoch.
    # Early stopping still monitors the training loss, as configured.
    nnReg.fit(
        Train_X,
        Train_Y,
        epochs=200,
        batch_size=256,
        validation_data=(Val_X, Val_Y),
        callbacks=[early_stopping],
    )

In [27]:
# Score the network's test-set predictions with R-squared.
y_pred = nnReg.predict(X_Test)
nn_r2 = r2_score(y_Test, y_pred)
print(f"R-squared: {nn_r2:.4f}")

[1m7317/7317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
R-squared: 0.2911
