# Imports and Installs

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Data Loading

In [None]:
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [None]:
data = data.drop(columns=['Name', 'id'])
test_data = test_data.drop(columns=['Name'])

# Data Preprocessing

## Data Cleaning

In [None]:
for d in [data, test_data]:
  for column in d.columns:
    if d.isna().sum()[column] > 0:
      mode_value = d[column].dropna().mode()[0]
      d[column].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  d[column].fillna(mode_value, inplace=True)


## Encoding

In [None]:
nominal_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
data_ohe = pd.DataFrame(ohe.fit_transform(data[nominal_cols]), columns=ohe.get_feature_names_out(nominal_cols), index=data.index)
test_data_ohe = pd.DataFrame(ohe.transform(test_data[nominal_cols]), columns=ohe.get_feature_names_out(nominal_cols), index=test_data.index)
data = data.drop(columns=nominal_cols).join(data_ohe)
test_data = test_data.drop(columns=nominal_cols).join(test_data_ohe)

## Train-Test Split

In [None]:
# Train-test split already performed by Kaggle
X_train = data.drop(columns=["Depression"])
y_train = data['Depression']
X_test = test_data

## Normalisation

In [None]:
for col in X_train.columns:
  if col not in nominal_cols:
    mu = X_train[col].mean()
    sigma = X_train[col].std()
    X_train[col] = (X_train[col] - mu) / sigma
    X_test[col] = (X_test[col] - mu) / sigma

# Model Handling

## Model Selection

In [None]:
model = LogisticRegression(random_state=42)

## Model Evaluation

In [None]:
def cross_validate(model, X_train, y_train, cv=5):
  skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
  accuracy_scores = []

  for train_index, test_index in skf.split(X_train, y_train):
  # Model training
      X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
      y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
      model.fit(X_train_fold, y_train_fold)

  # Model predictions
      y_pred = model.predict(X_test_fold)
      accuracy = accuracy_score(y_test_fold, y_pred)
      accuracy_scores.append(accuracy)

  # Performance summary
  return np.mean(accuracy_scores)

cross_validate(model, X_train, y_train)

0.9382871357498223

## Model Tuning

### Hyperparameter Tuning

In [None]:
params = {
    'max_iter': 1000,
    'class_weight': 'balanced'
    }

### Re-evaluation

In [None]:
model_ht = LogisticRegression(**params)
cross_validate(model, X_train, y_train)

0.9382871357498223

# Submission

In [None]:
# Without hyperparameter tuning
model.fit(X_train, y_train)
test_predictions = model.predict(X_test.drop(columns=['id']))
submission = pd.DataFrame({'id': X_test["id"], 'Depression': test_predictions})
submission.to_csv('basic_submission.csv', index=False)

In [None]:
# With hyperparamter tuning
model_ht.fit(X_train, y_train)
test_predictions = model_ht.predict(X_test.drop(columns=['id']))
submission = pd.DataFrame({'id': X_test["id"], 'Depression': test_predictions})
submission.to_csv('tuned_submission.csv', index=False)