In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import xgboost as xgb

Notes about data:

-Has 95 rows of integers with NaNs in them

-Then last 5 rows of Letters from : ['G', 'H', 'I', 'J', 'K', 'N', 'a']

In [7]:
df = pd.read_csv('case1Data.txt')
df.replace(to_replace=r'^\s*NaN\s*$', value=np.nan, regex=True, inplace=True)  # Case-insensitive, trims whitespace


# Replace NaNs with 0
df.fillna(0, inplace=True)

contains_string = df.applymap(lambda x: isinstance(x, str)).any().any()

print(f"DataFrame contains string values: {contains_string}")

DataFrame contains string values: True


In [8]:
last_five_columns = df.iloc[:, -5:]
concatenated_text = ''.join(last_five_columns.astype(str).values.flatten())

unique_letters = set(letter for letter in concatenated_text if letter.isalpha())

print(unique_letters)
# Letters that appear and their mapping to numbers
letters = ['G', 'H', 'I', 'J', 'K', 'N', 'a']
mapping = {letter: i+1.0 for i, letter in enumerate(letters)}

# Apply the mapping to the last 5 columns of the DataFrame
for col in df.iloc[:, -5:].columns:
    df[col] = df[col].apply(lambda x: ''.join(str(mapping.get(letter, letter)) for letter in str(x)))

# Convert everything to floats
for column in df.columns:
    df[column] = pd.to_numeric(df[column], errors='ignore')
X = df.drop('y', axis=1) 
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

{'I', 'G', 'J', 'H', 'K'}


Linear Regression

In [10]:


model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")


Mean Absolute Error (MAE): 43.50576556621277
Mean Squared Error (MSE): 3376.839544773491
Root Mean Squared Error (RMSE): 58.110580316956835
R-squared (R²): -0.022878824551715526


Random Forest Regression

In [19]:
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)


mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")


Mean Absolute Error (MAE): 31.814562712999997
Mean Squared Error (MSE): 1523.404525637632
Root Mean Squared Error (RMSE): 39.03081507780272
R-squared (R²): 0.5385453736133891


Support Vector Regression (with gridsearch for hyperparameters)

In [23]:
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 0.5],
    'kernel': ['rbf', 'linear']
}

grid_search = GridSearchCV(SVR(), param_grid, cv=5, scoring=rmse_scorer)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)  # Negate because we set greater_is_better=False

Best Parameters: {'C': 0.1, 'epsilon': 0.5, 'kernel': 'linear'}
Best RMSE: 38.882985457986436
Mean Absolute Error (MAE): 45.44650564064601
Mean Squared Error (MSE): 3262.089955021803
Root Mean Squared Error (RMSE): 57.11470874496168
R-squared (R²): 0.011879985846802166


XgBoost (with gridsearch)

In [26]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, 
                           scoring='neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)


Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.9}
Best RMSE: 34.30831454395398
