<a href="https://colab.research.google.com/github/patela3/Data-Projects/blob/main/cmsc320hw3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, classification_report, r2_score

from google.colab import drive
# Make Google Drive visible to the Colab runtime before touching any files.
drive.mount("/content/drive")

# Path of the merged CSV on the mounted drive; read it into the working frame.
base = "/content/drive/MyDrive/merged.csv"
df = pd.read_csv(filepath_or_buffer=base)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Basic cleaning ---
# Give the two columns used as prediction targets short, code-friendly names.
df = df.rename(columns={'What grade do you think you got?': 'Letter Grade',
                        'Total Score': 'numscore'})

# Drop unusable rows: no letter grade reported, or the respondent declined to say.
df = df.dropna(subset=['Letter Grade'])
df = df[df['Letter Grade'] != 'Prefer not to say'].copy()

# --- Binning ---
# Collapse letter grades into a binary label: A/B/C pass, D/F fail.
grade_pf = {
    'A': 'pass',
    'B': 'pass',
    'C': 'pass',
    'D': 'fail',
    'F': 'fail'
}
df['Pass/Fail'] = df['Letter Grade'].map(grade_pf)

# Series.map yields NaN for any value missing from the mapping. Fail fast with
# a readable message instead of carrying NaN labels into later modelling steps.
unmapped = df.loc[df['Pass/Fail'].isna(), 'Letter Grade'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Letter grades with no pass/fail mapping: {sorted(map(str, unmapped))}"
    )

# Target for this part: the binary pass/fail label.
y_pf = df['Pass/Fail']

# Columns kept out of the feature matrix: the label columns ('Letter Grade',
# 'Pass/Fail', 'numscore') so they cannot leak into the features, plus other
# fields the analysis does not use as predictors.
droppable = [
    'Timestamp',
    'Max Points',
    'I wanted the extra credit but just put down random responses (you\'ll still get the extra credit if you say yes)',
    'Letter Grade',
    'Pass/Fail',
    'numscore',
    'Which section are you in?'
]
# errors='ignore' tolerates entries of `droppable` that are absent from df.
X = df.drop(columns=droppable, errors='ignore')

# Partition features by dtype: numeric/bool columns are imputed and scaled,
# everything else is treated as categorical. Keying the split on 'object'
# alone would route columns stored with pandas' dedicated string dtype into
# the numeric pipeline, where median imputation cannot handle them.
num_cols = X.select_dtypes(include=['number', 'bool']).columns
cat_cols = X.select_dtypes(exclude=['number', 'bool']).columns

# Numeric features: fill gaps with the median, then standardise.
num_pl = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pl = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Single preprocessing step reused by every model pipeline.
preprocess = ColumnTransformer([
    ('cat', cat_pl, cat_cols),
    ('num', num_pl, num_cols)
])

# Candidate classifiers, each constructed with a fixed random_state.
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=2000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
}

# Hold out 20% of the rows, stratified on the pass/fail label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_pf,
    test_size=0.2,
    stratify=y_pf,
    random_state=42,
)

print("Part 1: Pass vs Fail Classification")
for name in models:
    model = models[name]
    # Preprocessing and estimator are fitted together on the training split.
    pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])
    pred = pipe.fit(X_train, y_train).predict(X_test)

    # F1 is reported with 'pass' as the positive class.
    acc = accuracy_score(y_test, pred)
    f1_pass = f1_score(y_test, pred, pos_label='pass', zero_division=0)

    print(f"\n{name}")
    print("Accuracy:", acc)
    print("F1 Score:", f1_pass)

Part 1: Pass vs Fail Classification

Logistic Regression
Accuracy: 0.9166666666666666
F1 Score: 0.9552238805970149

Random Forest
Accuracy: 0.8888888888888888
F1 Score: 0.9411764705882353

SVM
Accuracy: 0.8888888888888888
F1 Score: 0.9411764705882353


In [None]:
# Part 2
# Target for this part: the letter grade itself (multi-class).
y_grade = df['Letter Grade']
# Hold out 20% of the rows, stratified on the letter grade.
X_train, X_test, y_train, y_test = train_test_split(X, y_grade, test_size=0.2, random_state=42, stratify=y_grade)
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42)
}

# Fixed class list shared by the per-class report and the macro F1 below, so
# both average over the same five grades even when one of them is missing
# from the test labels and the predictions. Loop-invariant, so defined once.
labels_order = ['A', 'B', 'C', 'D', 'F']

print("Part 2: Classification for Letter Grade")
for name, model in models.items():
    # Preprocessing and estimator are fitted together on the training split.
    pipe = Pipeline([("preprocess", preprocess), ("model", model)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    print(f"\n{name}")
    print("\nClassification Report:")
    print(classification_report(y_test, pred, labels=labels_order, zero_division=0))
    print("Accuracy:", accuracy_score(y_test, pred))
    # labels= keeps this figure in agreement with the report's "macro avg" row.
    print("Macro Avg F1 Score:", f1_score(y_test, pred, labels=labels_order, average='macro', zero_division=0))

Part 2: Classification for Letter Grade

Logistic Regression

Classification Report:
              precision    recall  f1-score   support

           A       0.00      0.00      0.00         3
           B       0.50      0.68      0.58        19
           C       0.29      0.20      0.24        10
           D       0.00      0.00      0.00         3
           F       0.00      0.00      0.00         1

    accuracy                           0.42        36
   macro avg       0.16      0.18      0.16        36
weighted avg       0.34      0.42      0.37        36

Accuracy: 0.4166666666666667
Macro Avg F1 Score: 0.16261437908496731

Random Forest

Classification Report:
              precision    recall  f1-score   support

           A       0.00      0.00      0.00         3
           B       0.46      0.68      0.55        19
           C       0.29      0.20      0.24        10
           D       0.00      0.00      0.00         3
           F       0.00      0.00      0.00    

In [None]:
# Part 3
# Target for this part: the numeric score.
y_reg = df['numscore']
X_train, X_test, y_train, y_test = train_test_split(
    X, y_reg, random_state=42, test_size=0.2
)

# Random-forest regressor behind the shared preprocessing step.
model = RandomForestRegressor(random_state=42)
pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])
pred = pipe.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print("Part 3: Regression Prediction \n")

for label, value in (("MSE:", mse), ("RMSE:", rmse), ("R-squared (R2):", r2)):
    print(label, value)

Part 3: Regression Prediction 

MSE: 66.26188322222235
RMSE: 8.140140245857092
R-squared (R2): 0.052808629565246235
