# Mark Turos - Student Id: 9238806



In [None]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

## Subtask 1

In [None]:
# Subtask 1 (single-word targets): build features, tune a random forest and
# report its error on a held-out 20% of the training file.

# training data
filepath = 'lcp_single_train.tsv'
pd.set_option('display.max_colwidth',1000)
data = pd.read_csv(filepath, sep='\t')

## data preprocessing

# lower-case both columns so token counting and TF-IDF are case-insensitive
data['sentence'] = data['sentence'].str.lower()
data['token'] = data['token'].str.lower()

# token length only depends on the row itself, so it can be computed before
# the split without any information crossing between rows
data['token_length'] = data['token'].apply(len)


## modelling

# train-test split
# Split the rows *before* fitting anything. Every fitted statistic below
# (token counts, TF-IDF vocabulary/IDF weights, scaler) is then learned from
# the training split only, so the held-out rows give an honest error estimate.
train_rows, holdout_rows = train_test_split(data, test_size=0.2, random_state=42)
train_rows = train_rows.copy()      # copies: new columns are added below
holdout_rows = holdout_rows.copy()
y_train = train_rows['complexity']
y_test = holdout_rows['complexity']

# feature extraction: token frequency, counted on the training split
token_counts = train_rows['token'].value_counts()
train_rows['token_frequency'] = train_rows['token'].map(token_counts)
# tokens that never occur in the training split have no count -> use 0
holdout_rows['token_frequency'] = holdout_rows['token'].map(token_counts).fillna(0)

# TF-IDF: fit on training sentences, only transform the held-out ones
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(train_rows['sentence'])
tfidf_holdout = vectorizer.transform(holdout_rows['sentence'])

# combine TF-IDF + additional features (TF-IDF columns first, then the two
# hand-made columns; the test-time code must use the same column order)
extra_columns = ['token_length', 'token_frequency']
X_train = np.hstack([tfidf_train.toarray(), train_rows[extra_columns].values])
X_test = np.hstack([tfidf_holdout.toarray(), holdout_rows[extra_columns].values])

# feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# parameter tuning

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)
# GridSearchCV refits the best configuration on all of X_train by default,
# so best_estimator_ is already trained and needs no further fit() call.
best_model = grid_search.best_estimator_

# cross-validation to check for overfitting
# (scores are negated MSE, hence the sign flip before averaging)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mean = np.mean(-cv_scores)
cv_std = np.std(-cv_scores)
print(f'Cross-validated MSE: {cv_mean:.4f} ± {cv_std:.4f}')

# predictions on the held-out split
y_pred = best_model.predict(X_test)

# evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse:.4f}')
print(f'R^2 Score: {r2:.4f}')


# -------------------------------------------------------------- #

# Subtask 1 inference: score the official test file with the vectorizer,
# scaler and model fitted above, then write the predictions to a new .tsv.

# test data
test_file_path = 'lcp_single_test.tsv'
test_data = pd.read_csv(test_file_path, sep='\t')
# same lower-casing as the training data so tokens and words line up
test_data['sentence'] = test_data['sentence'].str.lower()
test_data['token'] = test_data['token'].str.lower()

# feature extraction
test_data['token_length'] = test_data['token'].apply(len)

# Look each token up in `token_counts`, the counts built from the training
# data earlier in this cell, instead of recounting on the test file. The model
# and the scaler were fitted on training-set counts, so recounting on the
# (differently sized) test file would feed them a feature on another scale.
# Tokens that never occurred in training have no entry and get frequency 0.
test_data['token_frequency'] = test_data['token'].map(token_counts).fillna(0)

# TF-IDF (transform only: vocabulary and IDF weights come from training)
tfidf_matrix_test = vectorizer.transform(test_data['sentence'])
additional_features_test = test_data[['token_length', 'token_frequency']].values
# column order must match training: TF-IDF columns first, then the extras
test_features = np.hstack([tfidf_matrix_test.toarray(), additional_features_test])

# scaling
test_features_scaled = scaler.transform(test_features)

# predictions
test_predictions = best_model.predict(test_features_scaled)

# assign predicted scores to test data
test_data['predicted_complexity'] = test_predictions

# write results to a new .tsv
test_data.to_csv('lcp_single_test_with_predictions.tsv', sep='\t', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '/content/lcp_single_train.tsv'

## Subtask 2

In [None]:
# Subtask 2 (multi-word targets): build features, tune an XGBoost regressor
# and report its error on a held-out 20% of the training file.

# training data
train_file_path = 'lcp_multi_train.tsv'
train_data = pd.read_csv(train_file_path, sep='\t')

# preprocessing
# lower-case both columns so token counting and TF-IDF are case-insensitive
train_data['sentence'] = train_data['sentence'].str.lower()
train_data['token'] = train_data['token'].str.lower()

# token length only depends on the row itself, so it can be computed before
# the split without any information crossing between rows
train_data['token_length'] = train_data['token'].apply(len)

# train-test split
# Split the rows *before* fitting anything. Every fitted statistic below
# (token counts, TF-IDF vocabulary/IDF weights, scaler) is then learned from
# the training split only, so the held-out rows give an honest error estimate.
fit_rows, holdout_rows = train_test_split(train_data, test_size=0.2, random_state=42)
fit_rows = fit_rows.copy()          # copies: new columns are added below
holdout_rows = holdout_rows.copy()
y_train = fit_rows['complexity']
y_test = holdout_rows['complexity']

# feature extraction: token frequency, counted on the training split
token_counts = fit_rows['token'].value_counts()
fit_rows['token_frequency'] = fit_rows['token'].map(token_counts)
# tokens that never occur in the training split have no count -> use 0
holdout_rows['token_frequency'] = holdout_rows['token'].map(token_counts).fillna(0)

# TF-IDF: fit on training sentences, only transform the held-out ones
vectorizer = TfidfVectorizer()
tfidf_fit = vectorizer.fit_transform(fit_rows['sentence'])
tfidf_holdout = vectorizer.transform(holdout_rows['sentence'])

# TF-IDF columns first, then the two hand-made columns; the test-time code
# must use the same column order
extra_columns = ['token_length', 'token_frequency']
X_train = np.hstack([tfidf_fit.toarray(), fit_rows[extra_columns].values])
X_test = np.hstack([tfidf_holdout.toarray(), holdout_rows[extra_columns].values])

# feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# xgb model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# parameter tuning
param_dist = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

random_search = RandomizedSearchCV(estimator=xgb_model,
                                   param_distributions=param_dist,
                                   n_iter=20,  # Number of parameter settings sampled
                                   cv=5,
                                   n_jobs=-1,
                                   scoring='neg_mean_squared_error',
                                   random_state=42)

# search, then take the winning model
random_search.fit(X_train, y_train)
# RandomizedSearchCV refits the best configuration on all of X_train by
# default, so best_estimator_ is already trained and needs no further fit().
best_xgb_model = random_search.best_estimator_

# predictions on the held-out split (not on the training rows)
y_pred = best_xgb_model.predict(X_test)

# eval
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'XGBoost - Mean Squared Error: {mse:.4f}')
print(f'XGBoost - R^2 Score: {r2:.4f}')


# -------------------------------------------------------------- #

# Subtask 2 inference: score the official test file with the vectorizer,
# scaler and XGBoost model fitted above, then write the predictions to a .tsv.

# test data
test_file_path = 'lcp_multi_test.tsv'
test_data = pd.read_csv(test_file_path, sep='\t')

# data preprocessing and feature extraction + TFIDF
# same lower-casing as the training data so tokens and words line up
test_data['sentence'] = test_data['sentence'].str.lower()
test_data['token'] = test_data['token'].str.lower()

test_data['token_length'] = test_data['token'].apply(len)
# Look each token up in `token_counts`, the counts built from the training
# data earlier in this cell, instead of recounting on the test file. The model
# and the scaler were fitted on training-set counts, so recounting on the
# (differently sized) test file would feed them a feature on another scale.
# Tokens that never occurred in training have no entry and get frequency 0.
test_data['token_frequency'] = test_data['token'].map(token_counts).fillna(0)

# transform only: vocabulary and IDF weights come from training
tfidf_matrix_test = vectorizer.transform(test_data['sentence'])
additional_features_test = test_data[['token_length', 'token_frequency']].values
# column order must match training: TF-IDF columns first, then the extras
test_features = np.hstack([tfidf_matrix_test.toarray(), additional_features_test])

test_features_scaled = scaler.transform(test_features)

# predictions
test_predictions = best_xgb_model.predict(test_features_scaled)

# assign predictions to the test data
test_data['predicted_complexity'] = test_predictions

# output results to a new .tsv
test_data.to_csv('lcp_multi_test_with_predictions.tsv', sep='\t', index=False)


XGBoost - Mean Squared Error: 0.0179
XGBoost - R^2 Score: 0.2890
Top 10 important features (indices): [1062 1337 1323  469 3233 3140 2492  442  307 3489]
