In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [6]:
# Read the pre-split test and train sets (in that order) from the working directory.
test_df, train_df = (
    pd.read_csv(csv_path) for csv_path in ('Test_Set.csv', 'Train_Set.csv')
)

In [7]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an
# earlier `to_csv` -- TODO confirm). `errors='ignore'` keeps this cell safe to
# re-run: the frames are reassigned here, so on a second execution the column
# is already gone and a plain drop would raise KeyError.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
def convert_to_int(df, n_trailing_cols=5):
    """Return a copy of ``df`` with all but the trailing columns cast to ``np.int8``.

    The first ``df.shape[1] - n_trailing_cols`` columns are cast to ``np.int8``;
    the last ``n_trailing_cols`` columns keep their dtype. A summary of the
    result is printed with ``DataFrame.info()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is not modified; a new frame is returned.
        Assumes the leading columns hold values that fit in int8 (e.g. 0/1
        indicator columns) -- this is not validated here, TODO confirm.
    n_trailing_cols : int, default 5
        Number of columns at the end of the frame to leave untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with the leading columns stored as ``np.int8``.

    Raises
    ------
    ValueError
        If ``n_trailing_cols`` is negative.
    """
    if n_trailing_cols < 0:
        raise ValueError("n_trailing_cols must be non-negative")

    # Clamp at zero: with fewer columns than `n_trailing_cols` the difference is
    # negative, and a negative slice bound would mean "all but the last |n|"
    # and convert columns that were meant to be left alone.
    num_cols_to_convert = max(df.shape[1] - n_trailing_cols, 0)
    cols_to_convert = df.columns[:num_cols_to_convert]
    dtype_dict = {col: np.int8 for col in cols_to_convert}

    # A dict-based astype casts only the listed columns and returns a new
    # frame, so the caller's frame is left as it was and re-running is safe.
    converted = df.astype(dtype_dict)
    converted.info()
    return converted

# Downcast both splits with the same rule. Each call also prints the frame's
# `info()` summary, so the train summary appears before the test summary.
train_df = convert_to_int(train_df)
test_df = convert_to_int(test_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 937485 entries, 0 to 937484
Columns: 542 entries, city_Adams to binned_label
dtypes: float64(4), int8(537), object(1)
memory usage: 515.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234372 entries, 0 to 234371
Columns: 542 entries, city_Adams to binned_label
dtypes: float64(4), int8(537), object(1)
memory usage: 129.0+ MB


In [9]:
# Training split: features are every column except the two targets;
# `label` is the regression target and `binned_label` the classification target.
X_train = train_df.drop(['label', 'binned_label'], axis=1)
y_train, y_binned_train = train_df['label'], train_df['binned_label']

# Test split, built the same way.
X_test = test_df.drop(['label', 'binned_label'], axis=1)
y_test, y_binned_test = test_df['label'], test_df['binned_label']

## Classification Models

### Logistic Regression

In [21]:
# One-vs-rest wrapper around a liblinear logistic regression, trained on the
# binned (categorical) target.
logistic_model = OneVsRestClassifier(
    estimator=LogisticRegression(max_iter=10000, solver='liblinear'),
)
logistic_model.fit(X=X_train, y=y_binned_train)

In [22]:
# Predicted bins for the test rows. `y_pred` is reassigned by every later
# model's predict cell, so the metrics cell below must run right after this one.
y_pred = logistic_model.predict(X_test)

In [23]:
# Score whatever predictions are currently in `y_pred` (expected: the logistic
# model's) against the binned test labels. Macro averaging gives every class
# the same weight regardless of its support.
print("Accuracy: ", accuracy_score(y_binned_test, y_pred))
print("Precision: ", precision_score(y_binned_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_binned_test, y_pred, average='macro'))
print("F1 Score: ", f1_score(y_binned_test, y_pred, average='macro'))
# `classification_report` is the per-class precision/recall/F1 table, not a
# confusion matrix, so it is labelled as such.
print("Classification Report: \n", classification_report(y_binned_test, y_pred))

Accuracy:  0.5390831669312034
Precision:  0.5553112509792538
Recall:  0.5474109669110186
F1 Score:  0.5478993591474152
Confusion Matrix: 
                  precision    recall  f1-score   support

1425k - 875000k       0.69      0.78      0.73     23332
    188k - 373k       0.50      0.63      0.56     58609
       1k - 89k       0.62      0.55      0.58     23886
    374k - 699k       0.53      0.50      0.52     58572
   700k - 1424k       0.49      0.43      0.46     35256
     90k - 187k       0.50      0.39      0.44     34717

       accuracy                           0.54    234372
      macro avg       0.56      0.55      0.55    234372
   weighted avg       0.54      0.54      0.53    234372



### Trees

#### CART (classification)

In [24]:
# Unpruned CART classifier. The seed is fixed (same value as the random forest
# below) so that tie-breaking between equally good splits is reproducible.
# NOTE(review): the recorded ~0.99 accuracy is far above the logistic baseline
# (~0.54); worth checking that no remaining feature column is derived from
# `label` -- TODO confirm.
cart_tree_classifier = DecisionTreeClassifier(random_state=42)

In [25]:
# Train the tree on the binned (classification) target.
cart_tree_classifier.fit(X_train, y_binned_train)

In [26]:
# Overwrites `y_pred` with the tree's predictions for the test rows; the
# metrics cell below reads this variable.
y_pred = cart_tree_classifier.predict(X_test)

In [27]:
# Score whatever predictions are currently in `y_pred` (expected: the CART
# classifier's) against the binned test labels, using macro-averaged metrics.
print("Accuracy: ", accuracy_score(y_binned_test, y_pred))
print("Precision: ", precision_score(y_binned_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_binned_test, y_pred, average='macro'))
print("F1 Score: ", f1_score(y_binned_test, y_pred, average='macro'))
# `classification_report` is the per-class precision/recall/F1 table, not a
# confusion matrix, so it is labelled as such.
print("Classification Report: \n", classification_report(y_binned_test, y_pred))

Accuracy:  0.9913257556363388
Precision:  0.9924715970084791
Recall:  0.9915855823776972
F1 Score:  0.9920137100788048
Confusion Matrix: 
                  precision    recall  f1-score   support

1425k - 875000k       1.00      0.99      1.00     23332
    188k - 373k       0.99      0.99      0.99     58609
       1k - 89k       1.00      1.00      1.00     23886
    374k - 699k       0.99      0.99      0.99     58572
   700k - 1424k       0.99      0.98      0.99     35256
     90k - 187k       0.99      1.00      0.99     34717

       accuracy                           0.99    234372
      macro avg       0.99      0.99      0.99    234372
   weighted avg       0.99      0.99      0.99    234372



#### Random Forest (classification)

In [30]:
# Small forest (10 trees) with a fixed seed so repeated runs give the same model.
random_forest_classifier = RandomForestClassifier(n_estimators=10, random_state=42)

In [33]:
# Train the forest on the binned (classification) target.
random_forest_classifier.fit(X_train, y_binned_train)

In [34]:
# Overwrites `y_pred` with the forest's predictions for the test rows; the
# metrics cell below reads this variable.
y_pred = random_forest_classifier.predict(X_test)

In [35]:
# Score whatever predictions are currently in `y_pred` (expected: the random
# forest classifier's) against the binned test labels, using macro-averaged
# metrics.
print("Accuracy: ", accuracy_score(y_binned_test, y_pred))
print("Precision: ", precision_score(y_binned_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_binned_test, y_pred, average='macro'))
print("F1 Score: ", f1_score(y_binned_test, y_pred, average='macro'))
# `classification_report` is the per-class precision/recall/F1 table, not a
# confusion matrix, so it is labelled as such.
print("Classification Report: \n", classification_report(y_binned_test, y_pred))

Accuracy:  0.9900798730223747
Precision:  0.9912096174191364
Recall:  0.9903096742979584
F1 Score:  0.9907427180266319
Confusion Matrix: 
                  precision    recall  f1-score   support

1425k - 875000k       0.99      1.00      1.00     23332
    188k - 373k       0.99      0.99      0.99     58609
       1k - 89k       1.00      0.99      1.00     23886
    374k - 699k       0.98      0.99      0.99     58572
   700k - 1424k       0.99      0.97      0.98     35256
     90k - 187k       0.99      0.99      0.99     34717

       accuracy                           0.99    234372
      macro avg       0.99      0.99      0.99    234372
   weighted avg       0.99      0.99      0.99    234372



### Naive Bayes

In [2]:
# Gaussian naive Bayes with default settings.
# NOTE(review): most feature columns were downcast to int8 earlier and have
# indicator-style names (e.g. `city_Adams`); a Gaussian likelihood may be a poor
# fit for such columns -- TODO confirm whether another NB variant is intended.
naive_bayes = GaussianNB()

In [10]:
# Train on the binned (classification) target.
naive_bayes.fit(X_train, y_binned_train)

In [11]:
# Overwrites `y_pred` with the naive Bayes predictions for the test rows; the
# metrics cell below reads this variable.
y_pred = naive_bayes.predict(X_test)

In [12]:
# Score whatever predictions are currently in `y_pred` (expected: the naive
# Bayes model's) against the binned test labels, using macro-averaged metrics.
print("Accuracy: ", accuracy_score(y_binned_test, y_pred))
print("Precision: ", precision_score(y_binned_test, y_pred, average='macro'))
print("Recall: ", recall_score(y_binned_test, y_pred, average='macro'))
print("F1 Score: ", f1_score(y_binned_test, y_pred, average='macro'))
# `classification_report` is the per-class precision/recall/F1 table, not a
# confusion matrix, so it is labelled as such.
print("Classification Report: \n", classification_report(y_binned_test, y_pred))

Accuracy:  0.24803730821087844
Precision:  0.3479079658771754
Recall:  0.3525784141266148
F1 Score:  0.20749280818880625
Confusion Matrix: 
                  precision    recall  f1-score   support

1425k - 875000k       0.20      0.95      0.34     23332
    188k - 373k       0.55      0.04      0.08     58609
       1k - 89k       0.23      0.83      0.36     23886
    374k - 699k       0.62      0.14      0.23     58572
   700k - 1424k       0.24      0.09      0.14     35256
     90k - 187k       0.24      0.07      0.10     34717

       accuracy                           0.25    234372
      macro avg       0.35      0.35      0.21    234372
   weighted avg       0.41      0.25      0.18    234372



## Regression Models

### Linear Regression

In [36]:
# Linear-regression baseline for the `label` target (fitted in the next cell).
model = LinearRegression()

In [37]:
# Fit on the un-binned `label` column rather than the binned classes.
model.fit(X_train, y_train)

In [38]:
# Overwrites `y_pred` with the linear model's predictions for the test rows.
y_pred = model.predict(X_test)

In [39]:
# R^2 of the linear model on the test split. `r2` is reassigned later by the
# random-forest regression cell, so read it before running that cell.
r2 = r2_score(y_test, y_pred)

In [40]:
# Report the score rounded to four decimals.
print(f"R-squared: {r2:.4f}")

R-squared: 0.1851


### Trees 

#### CART (regression)

In [41]:
# CART regressor with squared-error splits and cost-complexity pruning.
# The seed is fixed (same value as the random forests) so that tie-breaking
# between equally good splits is reproducible.
# NOTE(review): whether ccp_alpha=0.01 prunes anything depends on the scale of
# `label` -- TODO confirm the value is intentional.
cart_tree_reg = DecisionTreeRegressor(ccp_alpha=0.01, criterion='squared_error', random_state=42)

In [42]:
# Fit on the un-binned `label` column.
cart_tree_reg.fit(X_train, y_train)

In [43]:
# Overwrites `y_pred` with the regression tree's predictions for the test rows.
y_pred = cart_tree_reg.predict(X_test)

In [44]:
# R^2 of the regression tree on the test split, kept under its own name so it
# is not lost when `r2` is reassigned elsewhere.
r2_trees = r2_score(y_test, y_pred)

In [45]:
# Report the score rounded to four decimals.
print(f"R-squared: {r2_trees:.4f}")

R-squared: 0.9875


#### Random Forest (regression)

In [46]:
# Seeded 10-tree forest, trained on the un-binned `label` column.
random_forest_reg = RandomForestRegressor(
    random_state=42,
    n_estimators=10,
)
random_forest_reg.fit(X=X_train, y=y_train)

In [47]:
# Overwrites `y_pred` with the forest's predictions for the test rows.
y_pred = random_forest_reg.predict(X_test)

In [48]:
# Test-split R^2 for the forest; this replaces the linear model's value that
# was stored in `r2` earlier.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R-squared: {r2:.4f}")

R-squared: 0.9891
