In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression, Lasso , Ridge ,LogisticRegression
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,accuracy_score,confusion_matrix,recall_score,precision_score,root_mean_squared_error,root_mean_squared_log_error
from sklearn.metrics import f1_score,classification_report,roc_curve,roc_auc_score
from sklearn.preprocessing import PolynomialFeatures,StandardScaler,OneHotEncoder , LabelEncoder,MinMaxScaler
from sklearn import linear_model
import math
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb,XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.svm import SVC,SVR
from sklearn.tree import DecisionTreeRegressor , plot_tree, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier,BaggingClassifier,AdaBoostClassifier , RandomForestRegressor
from sklearn.datasets import load_iris
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', None)

In [34]:
import os
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [35]:
def clean_text(text, use_stemming=True):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    if use_stemming:
        stemmer = PorterStemmer()
        text = ' '.join([stemmer.stem(word) for word in text.split()])
    else:
        lemmatizer = WordNetLemmatizer()
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
neg_dir = "neg"
pos_dir = "pos"

In [None]:
def read_and_clean_data(directory, use_stemming=True):
    data = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            cleaned_text = clean_text(text, use_stemming)
            data.append(cleaned_text)

    return data

neg_data = read_and_clean_data(neg_dir, use_stemming=True)
pos_data = read_and_clean_data(pos_dir, use_stemming=True)

In [36]:
vectorizer = TfidfVectorizer(max_features=5000)
X_neg = vectorizer.fit_transform(neg_data)
X_pos = vectorizer.transform(pos_data)

from scipy import sparse
X = sparse.vstack([X_neg, X_pos])

y = [0] * len(neg_data) + [1] * len(pos_data)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print ("*"*100)

param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

best_model = LogisticRegression(**best_params)
best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Improved Accuracy: {accuracy_best}')
print('Improved Classification Report:')
print(classification_report(y_test, y_pred_best))

Accuracy: 0.8125
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.82      0.81       199
           1       0.82      0.81      0.81       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

****************************************************************************************************




Best Parameters: {'C': 10, 'solver': 'liblinear'}
Improved Accuracy: 0.835
Improved Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       199
           1       0.83      0.84      0.84       201

    accuracy                           0.83       400
   macro avg       0.84      0.83      0.83       400
weighted avg       0.84      0.83      0.83       400





In [39]:
from sklearn.naive_bayes import MultinomialNB

model1 = MultinomialNB()
model1.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print ("*"*100)

param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]
}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

best_model = MultinomialNB(**best_params)
best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Improved Accuracy: {accuracy_best}')
print('Improved Classification Report:')
print(classification_report(y_test, y_pred_best))

Accuracy: 0.8125
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.82      0.81       199
           1       0.82      0.81      0.81       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

****************************************************************************************************
Best Parameters: {'alpha': 0.5}
Improved Accuracy: 0.8
Improved Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.71      0.78       199
           1       0.76      0.89      0.82       201

    accuracy                           0.80       400
   macro avg       0.81      0.80      0.80       400
weighted avg       0.81      0.80      0.80       400

