In [32]:
import json
import pandas as pd

# Define the path to the JSON file
file_path = "train.json"
json_data = []

df = pd.read_json(file_path, lines=True)
df['target'] = df['target'].apply(lambda x: x[1])
print(df.head())

       0.0      36.6      36.7      38.0      38.2  38.300000000000004  \
0   3.0074   81.2963   83.7169  119.6888  115.8881            413.4768   
1  13.6523  361.5547  119.7225  170.9554   84.0531            273.8909   
2   7.1155   80.2869  169.7490  149.4092  132.8215            198.8544   
3   2.0998  235.5117  161.2402  241.3578  243.2237             89.7297   
4  16.2152  384.2739  174.5714   84.3117  137.9726            261.8213   

       39.0      39.1      39.2       39.6  ...    466.0    523.0    523.1  \
0  136.8640  817.5328   86.4743   477.6116  ...  15.8853   1.5231   4.9432   
1  175.9083  665.0791  588.2624    84.1962  ...   6.9459  26.9344   9.0805   
2  150.5273  112.4005  335.4482   675.7621  ...   9.3406  18.8055   7.0021   
3  640.7394  121.9373  396.3036  1138.3905  ...  26.2306   8.1640  26.6106   
4  365.8102   96.3555  219.9515   211.2868  ...  39.9084   4.7010  50.1098   

     523.4     524.4    524.5     554.2     554.4    1046.0  target  
0   9.2590   20.

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [48]:
X = df.drop(columns=['target'])
y = df['target']

print(y)

0       major
1       major
2       major
3       major
4       major
        ...  
9842    major
9843    major
9844    minor
9845    major
9846    minor
Name: target, Length: 9847, dtype: object


In [49]:
#transform y into binary array 

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print(y)

[0 0 0 ... 1 0 1]


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#create classifier with radial basis function (rbf) for non linear decision boundaries
#gamma is set to scale to automatically 
svm_classifier = SVC(kernel='rbf', gamma='scale')
svm_classifier.fit(X_train, y_train)

In [63]:
# this SVM model is outputs all predictions as major, of the dominant class. I am performing grid search to adjust
# hyperparameters in the following cell.
y_pred = svm_classifier.predict(X_test)
print(y_pred)
target_names = label_encoder.classes_
print(classification_report(y_test, y_pred, target_names=target_names))

[0 0 0 ... 0 0 0]
              precision    recall  f1-score   support

       major       0.64      1.00      0.78      1263
       minor       1.00      0.00      0.00       707

    accuracy                           0.64      1970
   macro avg       0.82      0.50      0.39      1970
weighted avg       0.77      0.64      0.50      1970



In [None]:
#UNDER CONSTRUCTION

#the following three cells attempt to tweak above svm by changing the hyperparam in hopes of
#improving decision boundary
#tweaking hyperparameters to determine best fits
param_grid = {
    'C': [0.1, 1, 10, 100],  #range for regularization hyperparam
    'gamma': ['scale', 'auto', 0.1, 0.01], #range for tests of gamma for rbf
}

#tqdm displays progress bar with long commands
from tqdm import tqdm 
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5)


In [71]:
with tqdm(total=len(param_grid['C']) * len(param_grid['gamma']), desc="Grid Search Progress") as pbar:
    grid_search.fit(X_train, y_train)
    pbar.update()

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Grid Search Progress:   0%|                              | 0/16 [00:21<?, ?it/s]


KeyboardInterrupt: 

In [72]:
y_pred = svm_classifier.predict(X_test)
# print(y_pred)
target_names = label_encoder.classes_
print(classification_report(y_test, y_pred, target_names=target_names))


## UNDER CONSTRUCTION

KeyboardInterrupt: 

In [67]:
#this cell creates basic logistic regression model using scikit learn
from sklearn.linear_model import LogisticRegression
logistic_regression_model = LogisticRegression(max_iter=3000) #increasing max iteration in hopes of convergance

# Train the model
logistic_regression_model.fit(X_train, y_train)

# Evaluate the model
y_pred = logistic_regression_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.80      0.74      1263
           1       0.50      0.35      0.41       707

    accuracy                           0.64      1970
   macro avg       0.60      0.58      0.58      1970
weighted avg       0.62      0.64      0.62      1970



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
#this cell creates 100 decision trees to create a random forest ensemble ML model

from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train, y_train)
y_pred = random_forest_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.97      0.79      1263
           1       0.71      0.14      0.23       707

    accuracy                           0.67      1970
   macro avg       0.69      0.55      0.51      1970
weighted avg       0.68      0.67      0.59      1970

