# MS2
- Make a baseline learning notebook carrying out some sort of linear or logistic regression (to be used as a benchmark; feel free to use sklearn). Details are left to you, but explain what you are doing in text cells in the notebook.

In [None]:
# Imports
import os
from datetime import datetime

import gdown
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

**Loading data**

In [None]:
# Google Drive id of the cleaned dataset and the local file name it is cached under.
file_id = "1wVHHFxZJ8AWwwXeD2BQs545AKZfwmlHN"
training_datapath = "cleaned_full_data.pkl"

# Download only when no local copy exists, so re-running the cell reuses the cached file.
if not os.path.exists(training_datapath):
    print("Downloading data from Google Drive...")
    downloaded_path = gdown.download(f"https://drive.google.com/uc?id={file_id}", training_datapath, quiet=False)
    # gdown.download can return None when the file could not be fetched. Fail here
    # with a clear message instead of letting the later read of the file fail.
    if downloaded_path is None:
        raise RuntimeError(
            f"Could not download '{training_datapath}' from Google Drive (file id {file_id})."
        )

Downloading data from Google Drive...


Downloading...
From (original): https://drive.google.com/uc?id=1wVHHFxZJ8AWwwXeD2BQs545AKZfwmlHN
From (redirected): https://drive.google.com/uc?id=1wVHHFxZJ8AWwwXeD2BQs545AKZfwmlHN&confirm=t&uuid=bf989245-56e5-4659-b68a-cb9db30d9ec7
To: /content/cleaned_full_data.pkl
100%|██████████| 117M/117M [00:01<00:00, 75.4MB/s]


**Selected Features**

In [None]:
# Columns kept from the full dataset: the case outcome, the two dates used to
# derive the processing time, location/citizenship fields and the employer size.
included_columns = [
    "CASE_STATUS",
    "RECEIVED_DATE",
    "DECISION_DATE",
    "EMPLOYER_STATE_PROVINCE",
    "EMPLOYER_CITY",
    "WORKSITE_STATE",
    "WORKSITE_CITY",
    "COUNTRY_OF_CITIZENSHIP",
    "EMPLOYER_NUM_EMPLOYEES",
]

In [None]:
# Load the cached pickle and keep only the selected columns.
# NOTE(review): unpickling can execute arbitrary code, and this file is downloaded
# from Google Drive — only run this against a file whose source you trust.
training_data = pd.read_pickle(training_datapath)[included_columns]

In [None]:
def getDays(datetime_val):
    """Return ``datetime_val.days`` when the value is exactly a ``datetime``, else the value unchanged.

    NOTE(review): this is applied below to differences between two date columns,
    which are not ``datetime`` instances, so every value takes the "unchanged"
    branch and the column is not converted to a day count here. The modelling
    cells call ``.dt.days`` on CASE_APPROVAL_LENGTH themselves and depend on it
    still being a timedelta column, so the behaviour is kept as is. Also note that
    a plain ``datetime`` has no ``.days`` attribute, so the other branch would
    raise if it were ever reached.
    """
    if type(datetime_val) != datetime:
        return datetime_val
    return datetime_val.days


# Binary target: 1 only when the status is exactly "Certified", 0 for anything else.
# NOTE(review): statuses such as "Certified-Expired" therefore get 0 — confirm this is intended.
training_data["CASE_STATUS_FLAG"] = training_data["CASE_STATUS"].apply(lambda x: 1 if "Certified" == x else 0)

# Derive the processing time only once. The loop below overwrites the date columns
# with integer epoch seconds, so re-running this cell would otherwise subtract
# integers and replace the timedelta column with plain numbers, breaking the later
# `.dt.days` calls.
if 'CASE_APPROVAL_LENGTH' not in training_data.columns:
    training_data['CASE_APPROVAL_LENGTH'] = training_data['DECISION_DATE'] - training_data['RECEIVED_DATE']
    training_data['CASE_APPROVAL_LENGTH'] = training_data['CASE_APPROVAL_LENGTH'].apply(getDays)

# Replace every datetime column with whole seconds since the epoch
# (the int64 view is in nanoseconds, hence the division by 10**9).
for col in training_data.select_dtypes(include=['datetime64']).columns:
    training_data[col] = training_data[col].astype('int64') // 10**9

# Drop every row that still has a missing value in any column.
training_data = training_data.dropna()

In [None]:
# Preview the first rows using the notebook's rich DataFrame display
# instead of printing the whole frame as plain text.
training_data.head()

              CASE_STATUS  RECEIVED_DATE  DECISION_DATE  \
0       Certified-Expired     1619049600     1633046400   
1       Certified-Expired     1618876800     1633046400   
2       Certified-Expired     1605484800     1633046400   
3       Certified-Expired     1618531200     1633046400   
4       Certified-Expired     1618272000     1633046400   
...                   ...            ...            ...   
104594             Denied     1643414400     1664496000   
104595             Denied     1643414400     1664496000   
104596             Denied     1643414400     1664496000   
104597             Denied     1643414400     1664496000   
104598             Denied     1643155200     1664496000   

       EMPLOYER_STATE_PROVINCE  EMPLOYER_CITY WORKSITE_STATE WORKSITE_CITY  \
0                     NEW YORK       NEW YORK       NEW YORK      New York   
1                     ILLINOIS      CHAMPAIGN   PENNSYLVANIA    Pittsburgh   
2                     NEW YORK       New York       NEW Y

## Linear Regression
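- Performing linear regression using our selected features to predict the case approval length (the number of days between the received date and the decision date).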

In [None]:
# Preprocessing
# The regression target is CASE_APPROVAL_LENGTH (in days). The feature matrix leaves
# out the target itself, the two outcome columns and the two date columns the target
# is computed from.
excluded_columns = ['CASE_APPROVAL_LENGTH', 'CASE_STATUS', 'CASE_STATUS_FLAG', 'DECISION_DATE', 'RECEIVED_DATE']
X_Linear_Regression = training_data.drop(columns=excluded_columns)
y_Linear_Regression = training_data['CASE_APPROVAL_LENGTH'].dt.days  # timedelta -> whole days

# Encode each text column as integer codes, keeping one fitted encoder per column.
# NOTE(review): integer codes impose an arbitrary ordering on nominal categories,
# which a linear model treats as numeric magnitude — acceptable for a rough
# benchmark, but worth revisiting.
categorical_columns = X_Linear_Regression.select_dtypes(include=['object']).columns
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    X_Linear_Regression[column] = encoder.fit_transform(X_Linear_Regression[column].astype(str))

# Hold out 20% of the rows for evaluation (fixed seed for a reproducible split).
(
    X_Linear_Regression_train,
    X_Linear_Regression_test,
    y_Linear_Regression_train,
    y_Linear_Regression_test,
) = train_test_split(
    X_Linear_Regression,
    y_Linear_Regression,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training split.
model = LinearRegression()
model.fit(X_Linear_Regression_train, y_Linear_Regression_train)

# Predict the held-out rows and score the predictions.
y_pred_Linear_Regression = model.predict(X_Linear_Regression_test)
mse = mean_squared_error(y_Linear_Regression_test, y_pred_Linear_Regression)
r2 = r2_score(y_Linear_Regression_test, y_pred_Linear_Regression)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 11957.682940090239
R² Score: 0.01016860871860592


## Logistic Regression
- Performing logistic regression with our selected data to predict whether a case is certified (`CASE_STATUS_FLAG`).

In [None]:
# Preprocessing
# The classification target is CASE_STATUS_FLAG; the feature matrix leaves out the
# flag and the status text it was derived from.
# NOTE(review): DECISION_DATE and CASE_APPROVAL_LENGTH remain in the feature set even
# though both are only known once a case has been decided — confirm that this is
# intended for the benchmark.
X_Logistic_Regression = training_data.drop(columns=['CASE_STATUS_FLAG', 'CASE_STATUS'])
X_Logistic_Regression['CASE_APPROVAL_LENGTH'] = X_Logistic_Regression['CASE_APPROVAL_LENGTH'].dt.days  # timedelta -> whole days
y_Logistic_Regression = training_data['CASE_STATUS_FLAG']  # Target variable

# Handle categorical variables by encoding them as integer codes,
# keeping one fitted encoder per column.
label_encoders = {}
for column in X_Logistic_Regression.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_Logistic_Regression[column] = le.fit_transform(X_Logistic_Regression[column].astype(str))
    label_encoders[column] = le

# Split the data: hold out 20% of the rows for evaluation (fixed seed for a reproducible split).
X_Logistic_Regression_train, X_Logistic_Regression_test, y_Logistic_Regression_train, y_Logistic_Regression_test = train_test_split(
    X_Logistic_Regression, y_Logistic_Regression, test_size=0.2, random_state=42
)

# Train the Logistic Regression model.
# The features differ in scale by many orders of magnitude (the date columns hold
# epoch seconds, the encoded categories are small integers), which makes the
# gradient-based solver converge poorly. Standardising inside a pipeline puts all
# features on a comparable scale, and the scaler is fitted on the training split
# only, so no test-set statistics leak into training.
logistic_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logistic_model.fit(X_Logistic_Regression_train, y_Logistic_Regression_train)

# Evaluate the model on the held-out rows (the pipeline applies the same scaling before predicting).
y_pred_Logistic_Regression = logistic_model.predict(X_Logistic_Regression_test)

# Calculate metrics
accuracy = accuracy_score(y_Logistic_Regression_test, y_pred_Logistic_Regression)
conf_matrix = confusion_matrix(y_Logistic_Regression_test, y_pred_Logistic_Regression)
class_report = classification_report(y_Logistic_Regression_test, y_pred_Logistic_Regression)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)


Accuracy: 0.5997226605460718

Confusion Matrix:
 [[12301   629]
 [ 7742   241]]

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.95      0.75     12930
           1       0.28      0.03      0.05      7983

    accuracy                           0.60     20913
   macro avg       0.45      0.49      0.40     20913
weighted avg       0.49      0.60      0.48     20913

