In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [2]:
# Load dataset
test = pd.read_csv('data/new/test.csv')
train = pd.read_csv('data/new/train.csv')

print(len(test))
print(len(train))

24000
96000


In [4]:
X_train = train.drop(["popularity"], axis=1)
X_test = test.drop(["popularity"], axis=1)

y_train = train["popularity"]
y_test = test["popularity"]

In [5]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression with Ridge Regularization

In [11]:
# Create a logistic regression model with L2 regularization (ridge)
model_log_ridge = LogisticRegression(penalty='l2', C=1.0)

# Fit the model to the training data
model_log_ridge.fit(X_train, y_train)

# Make predictions on the test data
y_pred_ridge = model_log_ridge.predict(X_test)

# Calculate Accuracy
accuracy_log_ridge = accuracy_score(y_test, y_pred_ridge)

# Calculate f1 score
f1_log_ridge = f1 = f1_score(y_test, y_pred_ridge, average='macro')

# Calculate the mean absolute error
mae_log_ridge = mean_absolute_error(y_test, y_pred_ridge)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Logistic Regression with LASSO

In [12]:
# Create a logistic regression model with L1 regularization (Lasso)
model_log_LASSO = LogisticRegression(penalty='l1', C=1.0, solver='liblinear')

# Fit the model to the training data
model_log_LASSO.fit(X_train, y_train)

# Make predictions on the test data
y_pred_lasso = model_log_LASSO.predict(X_test)

# Calculate Accuracy
accuracy_log_LASSO = accuracy_score(y_test, y_pred_lasso)

# Calculate f1 score
f1_log_LASSO = f1_score(y_test, y_pred_lasso, average='macro')

# Calculate the mean absolute error
mae_log_LASSO = mean_absolute_error(y_test, y_pred_lasso)

## k-Nearest Neighbours

In [17]:
# Define a range of k values to consider
k_values = list(range(3, 30, 3)) 

# Create a kNN model
knn = KNeighborsClassifier()

best_k = None
best_score = 0

# Perform cross-validation for each k value
for k in k_values:
    knn.n_neighbors = k  # Set the k value
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    mean_score = scores.mean()

    print(k)
    
    if mean_score > best_score:
        best_score = mean_score
        best_k = k

print("Best k value:", best_k)

3
6
9
12
15
18
21
24
27
Best k value: 27


In [19]:
# Train the final model with the best k value
final_knn = KNeighborsClassifier(n_neighbors=best_k)
final_knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred_knn = final_knn.predict(X_test)

# Calculate Accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Calculate f1 score
f1_knn = f1_score(y_test, y_pred_knn, average='macro')

# Calculate the mean absolute error
mae_knn = mean_absolute_error(y_test, y_pred_knn)

In [26]:
print("\033[1mAccuracy Comparison\033[0m")
print("Logistic Regression with Ridge Regularization:", accuracy_log_ridge)
print("Logistic Regression with LASSO:", accuracy_log_LASSO)
print("k nearest Neighbours with k=27:", accuracy_knn)

print("\n\033[1mMean Absolute Error Comparison\033[0m")
print("Logistic Regression with Ridge Regularization:", mae_log_ridge)
print("Logistic Regression with LASSO:", mae_log_LASSO)
print("k nearest Neighbours with k=27:", mae_knn)

print("\n\033[1mF1 Score Comparison\033[0m")
print("Logistic Regression with Ridge Regularization:", f1_log_ridge)
print("Logistic Regression with LASSO:", f1_log_LASSO)
print("k nearest Neighbours with k=27:", f1_knn)

[1mAccuracy Comparison[0m
Logistic Regression with Ridge Regularization: 0.20995833333333333
Logistic Regression with LASSO: 0.20820833333333333
k nearest Neighbours with k=27: 0.19495833333333334

[1mMean Absolute Error Comparison[0m
Logistic Regression with Ridge Regularization: 2.371208333333333
Logistic Regression with LASSO: 2.391875
k nearest Neighbours with k=27: 2.32725

[1mF1 Score Comparison[0m
Logistic Regression with Ridge Regularization: 0.18345070158125412
Logistic Regression with LASSO: 0.18038430382932585
k nearest Neighbours with k=27: 0.1831935665906388


In [27]:
pickle.dump(model_log_ridge, open('models/log_reg_ridge.sav', 'wb'))
pickle.dump(model_log_LASSO, open('models/log_reg_lasso.sav', 'wb'))
pickle.dump(final_knn, open('models/knn_bestk_27.sav', 'wb'))