In [64]:
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import pandas as pd

In [65]:
# load the dataset
df = pd.read_csv("text_training.csv", usecols=list(range(1,2002)))
print(df.shape)

# separate the features and target variable
features = df.iloc[:, 1:-1] # all columns except the last one (rating)
labels = df.iloc[:, -1] # last column (rating)

(2000, 2001)


In [66]:
# select the 10% best features
selector = SelectPercentile(percentile=10)
x = selector.fit_transform(features, labels)
support = selector.get_feature_names_out()

# Sanity check...
print(support.shape)

(200,)


  f = msb / msw


In [67]:
x_train, x_test, y_train, y_test = train_test_split(x, labels, test_size=0.2, random_state=1)

### Trying different models
#### Random forest

In [68]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8675


##### Network

In [69]:
mlp = MLPClassifier(max_iter=50, hidden_layer_sizes=[100, 50, 25], learning_rate='adaptive', random_state=1).fit(x_train, y_train)
y_pred = mlp.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8625




##### XGBooost classifier

In [70]:
xgbclf = xgb.XGBClassifier(max_depth=4, seed=2)
xgbclf.fit(x_train, y_train)
y_pred = xgbclf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9


Seems like XGBoost gave the best results!, we will use it for our prediction!

In [71]:
# load the dataset
df = pd.read_csv("text_Rollout_X.csv", usecols=list(range(1,2002)))
print(df.shape)

# separate the features and target variable
features = df.iloc[:, 1:-1] # all columns except the last one (rating)
labels = df.iloc[:, -1] # last column (rating)

(2000, 2001)


In [72]:
# select the 10% best features

# The features are filtered according to the selection made before
x = features[support]

In [73]:
# predict features for the test data using the chosen model
y_pred = xgbclf.predict(x)

In [74]:
# save predicted ratings and export
df['rating'] = y_pred
df.to_csv('text_rollout_X.csv')