Taking a look at the dataset and checking imbalances


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# Load the dataset
data = pd.read_csv('Train_Data.csv')

# Take a look at the data
print(data.head())


print("\n\nCheck Class imbalance")
class_distribution = data['pred'].value_counts()
print(class_distribution)

  pc      ld    m0    m1    m2       m3        m4       m5        m6       m7  \
0  y   83.71  63.0   7.2  52.5  14.0232  130.8300  12.4280  188.8425   8.9520   
1  y  108.94  31.5  12.8  84.0  13.2840  128.8350  13.5256  183.0990   8.8520   
2  E  169.65   0.0   5.6  73.5  14.5472  128.9295  13.6424  174.4680   8.9800   
3  x  122.42  31.5   7.2  63.0  15.0152  119.8575  12.3344  186.8580  10.7208   
4  E  125.43  94.5   7.2  42.0  14.4176  135.4290  14.5824  187.8135   9.3088   

         m8       m9       m10      m11       m12      m13       m14   ma  \
0  201.1905   9.2896  141.9075  16.0968  150.3390  12.4880  173.1240  m78   
1  207.2385   8.4704  154.7805  13.3304  101.0205  12.5096  131.4075  m78   
2  190.3125  11.3056  156.7650      NaN  122.5350  11.7136  176.8200  m76   
3  193.8195  10.6096  175.7490      NaN  124.8030  13.8424  168.2625  m55   
4  203.1540   9.8280  172.7040  14.4720  120.2145      NaN  150.1185  m76   

   pred  
0     0  
1     0  
2     1  
3     0  


Checking data types to encode other types to int


In [15]:
data.dtypes

pc       object
ld      float64
m0      float64
m1      float64
m2      float64
m3      float64
m4      float64
m5      float64
m6      float64
m7      float64
m8      float64
m9      float64
m10     float64
m11     float64
m12     float64
m13     float64
m14     float64
ma       object
pred      int64
dtype: object

In [16]:
# Label encode categorical columns from string to int
categorical_cols = ['pc','ma']
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

After testing various imputation methods, it was observed that the best result was obtained by dropping the nan values instead


In [17]:
# Impute NaN values using KNN imputer
data = data.dropna()

As there was a severe class imbalance both over and undersampling was tested.
Under sampling provided better results and hence was finalized

In [18]:
X = data.drop('pred', axis=1)
y = data['pred']

from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

# Assuming you have your feature matrix 'X' and corresponding labels 'y'
# Create an instance of the RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

# Perform undersampling on the dataset
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Convert the undersampled data to a DataFrame (optional)
data = pd.DataFrame(X_undersampled, columns=X.columns)
data['pred'] = y_undersampled


print("Chekcing cclass distribution after undersampling")
class_distribution = data['pred'].value_counts()
print(class_distribution)


X = data.drop('pred', axis=1)
y = data['pred']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Chekcing cclass distribution after undersampling
0    2841
1    2841
Name: pred, dtype: int64


In [19]:
models = [
        RandomForestClassifier(),
        LogisticRegression(),
        SVC(),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        GaussianNB(),
        MLPClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        XGBClassifier()
    ]

f1_scores = []
f1_scores_model = []

for model in models:
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      print(model)
      print(classification_report(y_test, y_pred))

RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.55      0.55      0.55       579
           1       0.53      0.53      0.53       558

    accuracy                           0.54      1137
   macro avg       0.54      0.54      0.54      1137
weighted avg       0.54      0.54      0.54      1137

LogisticRegression()
              precision    recall  f1-score   support

           0       0.57      0.61      0.59       579
           1       0.57      0.52      0.54       558

    accuracy                           0.57      1137
   macro avg       0.57      0.57      0.57      1137
weighted avg       0.57      0.57      0.57      1137



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVC()
              precision    recall  f1-score   support

           0       0.57      0.59      0.58       579
           1       0.56      0.55      0.56       558

    accuracy                           0.57      1137
   macro avg       0.57      0.57      0.57      1137
weighted avg       0.57      0.57      0.57      1137

DecisionTreeClassifier()
              precision    recall  f1-score   support

           0       0.49      0.49      0.49       579
           1       0.47      0.47      0.47       558

    accuracy                           0.48      1137
   macro avg       0.48      0.48      0.48      1137
weighted avg       0.48      0.48      0.48      1137

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.53      0.52      0.52       579
           1       0.51      0.51      0.51       558

    accuracy                           0.52      1137
   macro avg       0.52      0.52      0.52      1137
weighted avg       0

After checking multiple models in decreasing order of f1-score and accuracy.
Decision Tree Classifier consistently peroformed better and hence was finalized

In [5]:
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(model)
print(accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred))

DecisionTreeClassifier()
0.4793315743183817
              precision    recall  f1-score   support

           0       0.49      0.47      0.48       579
           1       0.47      0.49      0.48       558

    accuracy                           0.48      1137
   macro avg       0.48      0.48      0.48      1137
weighted avg       0.48      0.48      0.48      1137

0.48070175438596496


After deciding on the model its hyperparameters were tuned and tested to maximize accuracy

In [20]:
from sklearn.model_selection import GridSearchCV

classifier = DecisionTreeClassifier()

# Define the hyperparameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Perform grid search
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X, y)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2}


In [23]:
#Retesting the model

model = DecisionTreeClassifier(criterion=best_params['criterion'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(model)
print(accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred))

DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='sqrt',
                       min_samples_leaf=2)
0.5620052770448549
              precision    recall  f1-score   support

           0       0.61      0.38      0.47       579
           1       0.54      0.75      0.63       558

    accuracy                           0.56      1137
   macro avg       0.58      0.57      0.55      1137
weighted avg       0.58      0.56      0.55      1137

0.6283582089552239


Read and predict for the test data


In [24]:
test_data = pd.read_csv('Test_Data.csv')
test_data = test_data[X_train.columns]

After testing various imputation methods it was found that median gave the best results

In [25]:
# Preprocess the test data
test_data['pc'] = test_data['pc'].map(lambda s: label_encoder.transform([s])[0] if s in label_encoder.classes_ else -1)
test_data['ma'] = test_data['ma'].map(lambda s: label_encoder.transform([s])[0] if s in label_encoder.classes_ else -1)

#Deal with nan values
test_data = test_data.fillna(test_data.median())

Writing predictions to csv file

In [26]:
y_test_pred = model.predict(test_data)

# Prepare submission
submission = pd.DataFrame({'pred': y_test_pred})
submission = submission.astype(int)
# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)

After multiple iterations of the code (as there was randomness involved due to the under sampler) a maximum score of 34.9683 was obtained.

#Thank you for your time and consideration
#Peace