In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
import pickle

In [4]:
# Load dataset
test = pd.read_csv('/Users/sanyachauhan/Desktop/my_folder/data/new_reg/test_regression.csv')
train = pd.read_csv('/Users/sanyachauhan/Desktop/my_folder/data/new_reg/train_regression.csv')

print(len(test))
print(len(train))

51000
204000


In [5]:
X_train = train.drop(["popularity"], axis=1)
X_test = test.drop(["popularity"], axis=1)

y_train = train["popularity"]
y_test = test["popularity"]

In [6]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Linear Regression

In [15]:
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin_reg = lin_reg.predict(X_test)

# Generalized Linear Regression (using Ridge Regression as an example)
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_test)

# Calculating metrics for Linear Regression
linear_regression_metrics = {
    "R2": r2_score(y_test, y_pred_lin_reg),
    "MAE": mean_absolute_error(y_test, y_pred_lin_reg)
}

# Calculating metrics for Generalized Linear Regression
generalized_linear_regression_metrics = {
    "R2": r2_score(y_test, y_pred_ridge),
    "MAE": mean_absolute_error(y_test, y_pred_ridge)
}

print("Linear Regression Metrics:", linear_regression_metrics)
print("Generalized Linear Regression Metrics:", generalized_linear_regression_metrics)



Linear Regression Metrics: {'R2': 0.37153048834385405, 'MAE': 9.533896362185061}
Generalized Linear Regression Metrics: {'R2': 0.37153030498403705, 'MAE': 9.533902061137287}


In [22]:
pickle.dump(lin_reg, open('/Users/sanyachauhan/Desktop/my_folder/models/new_reg/linear.sav', 'wb'))
pickle.dump(ridge_reg, open('/Users/sanyachauhan/Desktop/my_folder/models/new_reg/ridgeGLM.sav', 'wb'))

## GLM

In [19]:
from sklearn.linear_model import TweedieRegressor

# Create a generalized linear model
# Example: power=0 for Normal, power=1 for Poisson, power=2 for Gamma
model_glm = TweedieRegressor(power=0, alpha=1.0)

# Fit the model to the training data
model_glm.fit(X_train, y_train)

# Make predictions on the test data
y_pred_glm = model_glm.predict(X_test)

# Calculate the mean squared error for GLM
mse_glm = mean_squared_error(y_test, y_pred_glm)

# Calculate R squared value for GLM
r2_glm = r2_score(y_test, y_pred_glm)
print(r2_glm)

print(mean_absolute_error(y_test, y_pred_ridge))


0.2836993101284494
9.533902061137287


In [23]:
pickle.dump(model_glm, open('/Users/sanyachauhan/Desktop/my_folder/models/new_reg/glm.sav', 'wb'))

## k-Nearest Neighbours - Regression

In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score


# Create KNN Regressor
knn = KNeighborsRegressor()

# Define a range of k values to test
param_grid = {'n_neighbors': range(1, 30)}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Best parameter and score
best_k = grid_search.best_params_['n_neighbors']
best_score = grid_search.best_score_

print("Best K:", best_k)
print("Best Score (Negative MAE):", best_score)

# Using the best K value to predict on test data
best_knn = KNeighborsRegressor(n_neighbors=best_k)
best_knn.fit(X_train, y_train)
y_pred_best_knn = best_knn.predict(X_test)

# Calculating metrics for KNN Regression with best K
best_knn_metrics = {
    "R2": r2_score(y_test, y_pred_best_knn),
    "MAE": mean_absolute_error(y_test, y_pred_best_knn)
}

print("K-Nearest Neighbors Regression Metrics with Best K:", best_knn_metrics)


Best K: 23
Best Score (Negative MAE): -9.38358567774936
K-Nearest Neighbors Regression Metrics with Best K: {'R2': 0.3800011471907332, 'MAE': 9.392740835464622}


In [21]:
pickle.dump(best_knn, open('/Users/sanyachauhan/Desktop/my_folder/models/new_reg/knnreg_bestk23.sav', 'wb'))