In [2]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# load the dataset
df = pd.read_csv("text_training.csv", usecols=list(range(1,2002)))

# separate the features and target variable
X = df.iloc[:, 1:-1] # all columns except the last one (rating)
y = df.iloc[:, -1] # last column (rating)

### performing different models
- feature selection
- test 40%

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [5]:
# perform feature selection
selector = SelectKBest(score_func=mutual_info_classif, k=50) # select top 50 features
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

In [6]:
def show_results(y_test, y_pred):
    # calculate the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # extract true positives, true negatives, false positives, false negatives
    tn, fp, fn, tp = conf_matrix.ravel()

    # calculate the score
    score = (tp + tn) * 100 / (tp + tn + fp + fn)

    print("Score: ", score)

#### Logistic regression

In [7]:
# train a logistic regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)


# make predictions on the test set
y_pred = clf.predict(X_test)
show_results(y_test, y_pred)

Score:  66.75


#### Random forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
show_results(y_test, y_pred)

Score:  64.625


#### SVC

In [9]:
from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
show_results(y_test, y_pred)

Score:  66.25


### Using cross validation - 8.5 average
- cross validation
- feature selection
- logistic regression

In [10]:
from sklearn.model_selection import cross_val_score

# define the model
clf = LogisticRegression()

# perform k-fold cross-validation
scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')

# print the average accuracy
print("Average accuracy: ", scores.mean())
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average accuracy:  0.8564999999999999


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.865, 0.86 , 0.845, 0.885, 0.875, 0.815, 0.855, 0.86 , 0.84 ,
       0.865])

scores returned on 
array([0.865, 0.86 , 0.845, 0.885, 0.875, 0.815, 0.855, 0.86 , 0.84 ,
       0.865])

### Using cross validation with feature selection - Bad results - 0.7~

In [11]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import cross_val_score

# perform feature selection
selector = SelectKBest(score_func=mutual_info_classif, k=100) # select top 50 features
X_new = selector.fit_transform(X, y)

# define the model
clf = LogisticRegression()

# perform k-fold cross-validation
scores = cross_val_score(clf, X_new, y, cv=10, scoring='accuracy')

# print the average accuracy
print("Average accuracy: ", scores.mean())

Average accuracy:  0.76


In [62]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB



models = [LogisticRegression(),
         SVC(kernel='linear'),
         RandomForestClassifier(),
         XGBClassifier(),
         MultinomialNB()
         ]

for model in models:

    kf = KFold(n_splits=15, shuffle=True, random_state=42)
    score_lst = []

    for index, (train_index, val_index) in enumerate(kf.split(X)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Train the model on X_train and y_train
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        # evaluate the model on X_val and y_val
        score = accuracy_score(y_val, y_pred)
        score_lst.append(score)

    print(f"{type(model)}'s average score: {round(sum(score_lst) *100/ len(score_lst), 2)}")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

<class 'sklearn.linear_model._logistic.LogisticRegression'>'s average score: 85.35
<class 'sklearn.svm._classes.SVC'>'s average score: 82.61
<class 'sklearn.ensemble._forest.RandomForestClassifier'>'s average score: 82.9
<class 'xgboost.sklearn.XGBClassifier'>'s average score: 84.65
<class 'sklearn.naive_bayes.MultinomialNB'>'s average score: 84.7


After figuring out the Logistic regression has the best results, We wil use it for or final prediction. 

In [35]:
from tqdm import tqdm
itr_count = 50
clf = LogisticRegression()
best_settings = {"K_best_variables": -1,
                 "Test_size": -1,
                 "Score": -1}
for size in tqdm(range(1, 10, 1)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size / 10, random_state=42)

    for k in tqdm(range(1, itr_count + 1)):

        selector = SelectKBest(score_func=mutual_info_classif, k=k) # select top 50 features
        
        selector.fit(X_train, y_train)

        # Transform the training and test sets to select the k best features
        X_train_kbest = selector.transform(X_train)
        X_test_kbest = selector.transform(X_test)

        clf.fit(X_train_kbest, y_train)
        # make predictions on the test set
        y_pred = clf.predict(X_test_kbest)

        # show_results(y_test, y_pred)
        score = accuracy_score(y_test, y_pred)
        if score > best_settings["Score"]:
            best_settings["Score"] = score
            best_settings["K_best_variables"] = k
            best_settings["Test_size"] = size
            print(best_settings)
            show_results(y_test, y_pred)

  0%|          | 0/9 [00:00<?, ?it/s]

{'K_best_variables': 1, 'Test_size': 1, 'Score': 0.615}
Score:  61.5




{'K_best_variables': 2, 'Test_size': 1, 'Score': 0.65}
Score:  65.0




{'K_best_variables': 8, 'Test_size': 1, 'Score': 0.675}
Score:  67.5




{'K_best_variables': 13, 'Test_size': 1, 'Score': 0.695}
Score:  69.5




{'K_best_variables': 22, 'Test_size': 1, 'Score': 0.7}
Score:  70.0




{'K_best_variables': 25, 'Test_size': 1, 'Score': 0.715}
Score:  71.5




{'K_best_variables': 28, 'Test_size': 1, 'Score': 0.73}
Score:  73.0




{'K_best_variables': 36, 'Test_size': 1, 'Score': 0.745}
Score:  74.5




{'K_best_variables': 46, 'Test_size': 1, 'Score': 0.765}
Score:  76.5


100%|██████████| 50/50 [06:36<00:00,  7.94s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 50/50 [05:57<00:00,  7.15s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 50/50 [05:25<00:00,  6.51s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the da

In [36]:
from tqdm import tqdm
itr_count = 300
clf = LogisticRegression()
best_settings = {"K_best_variables": -1,
                 "Test_size": -1,
                 "Score": -1}
for size in tqdm(range(1, 10, 1)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size / 10, random_state=42)

    for k in tqdm(range(1, itr_count + 1)):

        selector = SelectKBest(score_func=mutual_info_classif, k=k) # select top 50 features
        
        selector.fit(X_train, y_train)

        # Transform the training and test sets to select the k best features
        X_train_kbest = selector.transform(X_train)
        X_test_kbest = selector.transform(X_test)

        clf.fit(X_train_kbest, y_train)
        # make predictions on the test set
        y_pred = clf.predict(X_test_kbest)

        # show_results(y_test, y_pred)
        score = accuracy_score(y_test, y_pred)
        if score > best_settings["Score"]:
            best_settings["Score"] = score
            best_settings["K_best_variables"] = k
            best_settings["Test_size"] = size
            print(best_settings)
            show_results(y_test, y_pred)

  0%|          | 0/9 [00:00<?, ?it/s]

{'K_best_variables': 1, 'Test_size': 1, 'Score': 0.595}
Score:  59.5




{'K_best_variables': 2, 'Test_size': 1, 'Score': 0.62}
Score:  62.0




{'K_best_variables': 3, 'Test_size': 1, 'Score': 0.655}
Score:  65.5




{'K_best_variables': 7, 'Test_size': 1, 'Score': 0.665}
Score:  66.5




{'K_best_variables': 19, 'Test_size': 1, 'Score': 0.675}
Score:  67.5




{'K_best_variables': 20, 'Test_size': 1, 'Score': 0.71}
Score:  71.0




{'K_best_variables': 24, 'Test_size': 1, 'Score': 0.72}
Score:  72.0




{'K_best_variables': 34, 'Test_size': 1, 'Score': 0.725}
Score:  72.5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'K_best_variables': 45, 'Test_size': 1, 'Score': 0.73}
Score:  73.0




{'K_best_variables': 48, 'Test_size': 1, 'Score': 0.745}
Score:  74.5




{'K_best_variables': 50, 'Test_size': 1, 'Score': 0.77}
Score:  77.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'K_best_variables': 60, 'Test_size': 1, 'Score': 0.775}
Score:  77.5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'K_best_variables': 83, 'Test_size': 1, 'Score': 0.78}
Score:  78.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'K_best_variables': 121, 'Test_size': 1, 'Score': 0.795}
Score:  79.5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'K_best_variables': 134, 'Test_size': 1, 'Score': 0.82}
Score:  82.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'K_best_variables': 185, 'Test_size': 1, 'Score': 0.83}
Score:  83.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'K_best_variables': 300, 'Test_size': 1, 'Score': 0.835}
Score:  83.5


  4%|▍         | 12/300 [01:29<35:37,  7.42s/it]
 11%|█         | 1/9 [40:46<5:26:15, 2446.93s/it]


KeyboardInterrupt: 

In [45]:
import json

def create_key(dict):
    lst = list(dict.values())
    return f"{lst[0]}-{lst[1]}"

def write_json(dict):
    with open("data.json", "w") as infile:
        data = json.load(infile)
        data[create_key(dict)] = dict["Score"]
        json.dump(data, infile)

def read_json(dict):
    with open("data.json", "r") as infile:
        data = json.load(infile)
        return data[create_key(dict)]

In [63]:
from tqdm import tqdm
from sklearn.naive_bayes import MultinomialNB

itr_count = 150
clf = LogisticRegression()
best_settings = {"K_best_variables": -1,
                 "Test_size": -1,
                 "Score": -1}
for size in range(3, 10, 1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size / 10, random_state=42)

    for k in tqdm(range(150, 150 + itr_count + 1)):

        selector = SelectKBest(score_func=mutual_info_classif, k=k) # select top 50 features
        
        selector.fit(X_train, y_train)

        # Transform the training and test sets to select the k best features
        X_train_kbest = selector.transform(X_train)
        X_test_kbest = selector.transform(X_test)

        clf.fit(X_train_kbest, y_train)
        # make predictions on the test set
        y_pred = clf.predict(X_test_kbest)

        # show_results(y_test, y_pred)
        score = accuracy_score(y_test, y_pred)
        d = {"Test_size": size, "K_best_variables": k, "Score": score}
        #write_json(d)
        if score > best_settings["Score"]:
            best_settings["Score"] = score
            best_settings["K_best_variables"] = k
            best_settings["Test_size"] = size
            print(best_settings)
            show_results(y_test, y_pred)

  1%|          | 1/151 [00:06<15:54,  6.36s/it]

{'K_best_variables': 150, 'Test_size': 3, 'Score': 0.7133333333333334}
Score:  71.33333333333333


  1%|▏         | 2/151 [00:12<15:42,  6.33s/it]

{'K_best_variables': 151, 'Test_size': 3, 'Score': 0.7866666666666666}
Score:  78.66666666666667


  8%|▊         | 12/151 [01:16<14:55,  6.44s/it]

{'K_best_variables': 161, 'Test_size': 3, 'Score': 0.7966666666666666}
Score:  79.66666666666667


 12%|█▏        | 18/151 [01:55<14:20,  6.47s/it]