In [12]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd

In [13]:
# load the dataset
df = pd.read_csv("text_training.csv", usecols=list(range(1,2002)))

# separate the features and target variable
X = df.iloc[:, 1:-1] # all columns except the last one (rating)
y = df.iloc[:, -1] # last column (rating)

### performing different models
- feature selection
- test 40%

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [15]:
# perform feature selection
selector = SelectKBest(score_func=mutual_info_classif, k=50) # select top 50 features
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

In [16]:
def show_results(y_test, y_pred):
    # calculate the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # extract true positives, true negatives, false positives, false negatives
    tn, fp, fn, tp = conf_matrix.ravel()

    # calculate the score
    score = (tp + tn) * 100 / (tp + tn + fp + fn)

    print("Score: ", score)

#### Logistic regression

In [17]:
# train a logistic regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)


# make predictions on the test set
y_pred = clf.predict(X_test)
show_results(y_test, y_pred)

Score:  67.375


#### Random forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
show_results(y_test, y_pred)

Score:  64.125


#### SVC

In [19]:
from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
show_results(y_test, y_pred)

Score:  65.375


### Using cross validation - 8.5 average
- cross validation
- feature selection
- logistic regression

In [20]:
from sklearn.model_selection import cross_val_score

# define the model
clf = LogisticRegression()

# perform k-fold cross-validation
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')

# print the average accuracy
print("Average accuracy: ", scores.mean())
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average accuracy:  0.8564999999999999


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.865, 0.86 , 0.845, 0.885, 0.875, 0.815, 0.855, 0.86 , 0.84 ,
       0.865])

scores returned on 
array([0.865, 0.86 , 0.845, 0.885, 0.875, 0.815, 0.855, 0.86 , 0.84 ,
       0.865])

### Using cross validation with feature selection - Bad results - 0.7~

In [21]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_val_score

# perform feature selection
selector = SelectKBest(score_func=mutual_info_classif, k=50) # select top 50 features
X_new = selector.fit_transform(X, y)

# define the model
clf = LogisticRegression()

# perform k-fold cross-validation
scores = cross_val_score(clf, X_new, y, cv=10, scoring='accuracy')

# print the average accuracy
print("Average accuracy: ", scores.mean())

Average accuracy:  0.741


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

clf = LogisticRegression()
#clf = RandomForestClassifier()

kf = KFold(n_splits=40, shuffle=True, random_state=42)
best_fold_index = -1
best_score = -1
best_train_index = []
best_val_index = []
for index, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Train the model on X_train and y_train
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    # evaluate the model on X_val and y_val
    score = accuracy_score(y_val, y_pred)
    if score > best_score:
        best_score = score
        best_fold_index = index
        best_train_index, best_val_index = train_index, val_index

print(best_score, best_fold_index)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

1.0 23


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
