In [31]:
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression


In [32]:
test_df = pd.read_csv("quality_test.csv")
train_df = pd.read_csv("quality_train.csv")

print(test_df.shape)
print(test_df.head())
print(train_df.shape)

(20000, 3)
   category   popular     genre
0         1 -0.811510  0.748751
1         2 -0.091522 -0.889338
2         2  0.339104 -0.683431
3         1 -0.254350  0.820988
4         2 -0.531400 -1.983743
(80000, 3)


In [33]:
train_df['category'] = train_df['category'] - 1
#train_df['category'] = train_df['category'].astype('category')

test_df['category'] = test_df['category'] - 1
#test_df['category'] = test_df['category'].astype('category')


The model equation is:

logit(𝑃(Y=k)) = $𝛽_{0,k}$ + $𝛽_{1,k}$ * popular + $𝛽_{2,k}$ * genre 

where Y is the response variable (category), 𝑃(Y=k) is the probability of category k, and  $𝛽_{0,k}$, $𝛽_{1,k}$, $𝛽_{2,k}$ are the model coefficients for each category.


In [34]:
# Separate features and target variable
X_train = train_df[['popular', 'genre']]
y_train = train_df['category']
X_test = test_df[['popular', 'genre']]
y_test = test_df['category']

In [69]:
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=10)

# fit the model with data
res = logreg.fit(X_train, y_train)

predProbs = res.predict_proba(X_train)



print(logreg.coef_)


[[ 3.54027342 -5.32328568]]


In [75]:
import statsmodels.api as sm 

log_reg = sm.Logit(y_train, X_train).fit() 

print(log_reg.summary()) 


Optimization terminated successfully.
         Current function value: 0.253422
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               category   No. Observations:                80000
Model:                          Logit   Df Residuals:                    79998
Method:                           MLE   Df Model:                            1
Date:                Wed, 23 Oct 2024   Pseudo R-squ.:                  0.6342
Time:                        10:52:09   Log-Likelihood:                -20274.
converged:                       True   LL-Null:                       -55422.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
popular        3.3744      0.034     98.944      0.000       3.308       3.441
genre         -5.3555      0.

In [54]:
# import the metrics class
from sklearn import metrics

y_pred = res.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix



array([[8536, 1232],
       [1176, 9056]], dtype=int64)

$C_{0,0}$, false negatives is $C_{1,0}$, true positives is $C_{1,1}$ and false positives is $C_{0,1}$

In [64]:
from sklearn.metrics import classification_report
#target_names = ['without diabetes', 'with diabetes']
conf_matrix = classification_report(y_test, y_pred)
print(conf_matrix)#, target_names=target_names))

              precision    recall  f1-score   support

           0       0.88      0.87      0.88      9768
           1       0.88      0.89      0.88     10232

    accuracy                           0.88     20000
   macro avg       0.88      0.88      0.88     20000
weighted avg       0.88      0.88      0.88     20000



In [66]:
from sklearn.metrics import confusion_matrix, accuracy_score


tn, fp, fn, tp = cnf_matrix.ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate TPR (Sensitivity)
tpr = tp / (tp + fn)

# Calculate FPR
fpr = fp / (fp + tn)

# Calculate Specificity
specificity = tn / (tn + fp)

# Display the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"True Positive Rate (Sensitivity): {tpr:.2f}")
print(f"False Positive Rate: {fpr:.2f}")
print(f"Specificity: {specificity:.2f}")

Accuracy: 0.88
True Positive Rate (Sensitivity): 0.89
False Positive Rate: 0.13
Specificity: 0.87


In [77]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Predict the categories on the test data
y_pred = lda.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pred)

# Evaluate the model
print("Confusion Matrix:\n",cnf_matrix )
print("\nClassification Report:\n", classification_report(y_test, y_pred))


tn, fp, fn, tp = cnf_matrix.ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate TPR (Sensitivity)
tpr = tp / (tp + fn)

# Calculate FPR
fpr = fp / (fp + tn)

# Calculate Specificity
specificity = tn / (tn + fp)

# Display the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"True Positive Rate (Sensitivity): {tpr:.2f}")
print(f"False Positive Rate: {fpr:.2f}")
print(f"Specificity: {specificity:.2f}")

Confusion Matrix:
 [[8214 1554]
 [ 924 9308]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.84      0.87      9768
           1       0.86      0.91      0.88     10232

    accuracy                           0.88     20000
   macro avg       0.88      0.88      0.88     20000
weighted avg       0.88      0.88      0.88     20000

Accuracy: 0.88
True Positive Rate (Sensitivity): 0.91
False Positive Rate: 0.16
Specificity: 0.84


In [78]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Fit the QDA model
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

# Predict the categories on the test data
y_pred = qda.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pred)

# Evaluate the model
print("Confusion Matrix:\n",cnf_matrix )
print("\nClassification Report:\n", classification_report(y_test, y_pred))


tn, fp, fn, tp = cnf_matrix.ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate TPR (Sensitivity)
tpr = tp / (tp + fn)

# Calculate FPR
fpr = fp / (fp + tn)

# Calculate Specificity
specificity = tn / (tn + fp)

# Display the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"True Positive Rate (Sensitivity): {tpr:.2f}")
print(f"False Positive Rate: {fpr:.2f}")
print(f"Specificity: {specificity:.2f}")

Confusion Matrix:
 [[8288 1480]
 [ 999 9233]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.85      0.87      9768
           1       0.86      0.90      0.88     10232

    accuracy                           0.88     20000
   macro avg       0.88      0.88      0.88     20000
weighted avg       0.88      0.88      0.88     20000

Accuracy: 0.88
True Positive Rate (Sensitivity): 0.90
False Positive Rate: 0.15
Specificity: 0.85


In [79]:
from sklearn.neighbors import KNeighborsClassifier
k_values = [1, 5, 10, 50]

for k in k_values: 
    # Fit the KNN model 
    knn = KNeighborsClassifier(n_neighbors=k) 
    knn.fit(X_train, y_train)

    print(f"\nResults for K={k}:")
    # Predict the categories on the test data
    y_pred = knn.predict(X_test)

    cnf_matrix = confusion_matrix(y_test, y_pred)

    # Evaluate the model
    print("Confusion Matrix:\n",cnf_matrix )
    print("\nClassification Report:\n", classification_report(y_test, y_pred))


    tn, fp, fn, tp = cnf_matrix.ravel()

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate TPR (Sensitivity)
    tpr = tp / (tp + fn)

    # Calculate FPR
    fpr = fp / (fp + tn)

    # Calculate Specificity
    specificity = tn / (tn + fp)

    # Display the metrics
    print(f"Accuracy: {accuracy:.2f}")
    print(f"True Positive Rate (Sensitivity): {tpr:.2f}")
    print(f"False Positive Rate: {fpr:.2f}")
    print(f"Specificity: {specificity:.2f}")


Results for K=1:
Confusion Matrix:
 [[8131 1637]
 [1647 8585]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83      9768
           1       0.84      0.84      0.84     10232

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000

Accuracy: 0.84
True Positive Rate (Sensitivity): 0.84
False Positive Rate: 0.17
Specificity: 0.83

Results for K=5:
Confusion Matrix:
 [[8419 1349]
 [1339 8893]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86      9768
           1       0.87      0.87      0.87     10232

    accuracy                           0.87     20000
   macro avg       0.87      0.87      0.87     20000
weighted avg       0.87      0.87      0.87     20000

Accuracy: 0.87
True Positive Rate (Sensitivity): 0.87
False Positive R

In [80]:
from sklearn.naive_bayes import GaussianNB

# Fit the Gaussian Naive Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predict the categories on the test data
y_pred = nb.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pred)

# Evaluate the model
print("Confusion Matrix:\n",cnf_matrix )
print("\nClassification Report:\n", classification_report(y_test, y_pred))


tn, fp, fn, tp = cnf_matrix.ravel()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate TPR (Sensitivity)
tpr = tp / (tp + fn)

# Calculate FPR
fpr = fp / (fp + tn)

# Calculate Specificity
specificity = tn / (tn + fp)

# Display the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"True Positive Rate (Sensitivity): {tpr:.2f}")
print(f"False Positive Rate: {fpr:.2f}")
print(f"Specificity: {specificity:.2f}")

Confusion Matrix:
 [[8080 1688]
 [1086 9146]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.85      9768
           1       0.84      0.89      0.87     10232

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

Accuracy: 0.86
True Positive Rate (Sensitivity): 0.89
False Positive Rate: 0.17
Specificity: 0.83


Logistic Regression:

Accuracy: 0.88
Sensitivity: 0.89
Specificity: 0.87
Low bias, moderate variance

Linear Discriminant Analysis (LDA):

Accuracy: 0.88
Sensitivity: 0.91
Specificity: 0.84
Moderate bias, lower variance due to a more rigid decision boundary assumption.

Quadratic Discriminant Analysis (QDA):

Accuracy: 0.88
Sensitivity: 0.90
Specificity: 0.85
Lower bias, higher variance due to flexible decision boundaries.

K-Nearest Neighbors (KNN):

K=1: Accuracy: 0.84, Sensitivity: 0.84, Specificity: 0.83 (High variance, low bias)
K=5: Accuracy: 0.87, Sensitivity: 0.87, Specificity: 0.86 (Reduced variance, slightly increased bias)
K=10: Accuracy: 0.87, Sensitivity: 0.85, Specificity: 0.89 (Good balance of bias-variance)
K=50: Accuracy: 0.88, Sensitivity: 0.87, Specificity: 0.88 (Low variance, higher bias)
Naive Bayes:

Accuracy: 0.86
Sensitivity: 0.89
Specificity: 0.83
High bias, low variance due to independence assumption

Logistic Regression, LDA, and KNN with K=50 perform similarly well with Logistic regression being the best