In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Fetch the NLTK data packages this notebook relies on. Each call logs its
# progress and returns a status flag; the value of the last call is what the
# cell displays.
# NOTE(review): 'punkt' is presumably the tokenizer data behind word_tokenize;
# newer NLTK releases may look for 'punkt_tab' instead - confirm against the
# installed version.
nltk.download('punkt')
# Stop-word lists read later through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Module-level NLP helpers, created once and shared by preprocess().
# The lemmatizer is instantiated here but not referenced by any visible cell.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Held as a set so the per-token membership test in preprocess() is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize a raw text string into a space-joined string of stemmed tokens.

    Steps, in order: strip @mentions, strip links, replace every non-letter
    character with a space, lowercase, tokenize, drop the words found in the
    module-level ``stop_words`` set, and stem what remains with the
    module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    # Remove @mentions: an "@" followed by any run of non-whitespace.
    text = re.sub(r'@\S+', '', text)

    # Remove links. The scheme is written `https?` so both http:// and
    # https:// are stripped; the earlier `http?` made the "p" optional and
    # therefore never matched https URLs, leaving fragments such as "https"
    # and "com" behind as tokens. Raw strings avoid invalid-escape warnings.
    text = re.sub(r'(?:www\.|https?://)\S+', '', text)

    # Replace everything that is not an ASCII letter (digits, punctuation,
    # non-ASCII characters) with a space so word boundaries are preserved.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Lowercase and tokenize.
    tokens = nltk.word_tokenize(text.lower())

    # Drop stop words, then stem the remaining tokens in a single pass.
    stems = [stemmer.stem(word) for word in tokens if word not in stop_words]

    return ' '.join(stems)

In [3]:
def _load_split(csv_path):
    """Read one CSV split and return it cleaned for modelling.

    Rows with a missing value in any column are dropped *before* the text is
    cast to ``str``. Casting first would turn a missing text into the literal
    string ``'nan'``, which ``dropna`` can no longer detect, so such rows
    would slip through as one-token documents.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; it must contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``text`` run through ``preprocess`` and rows whose
        preprocessed text duplicates an earlier row removed (first kept).
        The original index labels are retained.
    """
    split = pd.read_csv(csv_path).dropna()
    split = split.assign(text=split['text'].astype(str).apply(preprocess))
    # De-duplicate on the *preprocessed* text: different raw strings can
    # collapse to the same stems.
    return split.drop_duplicates(subset='text')


df_train = _load_split("../train_aug.csv")
df_test = _load_split("../test_aug.csv")

In [None]:
# Bag-of-words counts feeding a multinomial naive Bayes classifier.
# make_pipeline names each step after its lowercased class name, which is
# where the 'countvectorizer__' / 'multinomialnb__' prefixes below come from.
# NOTE(review): stop_words='english' is applied to text that preprocess() has
# already stop-word-filtered and stemmed - confirm this second filter is wanted.
pipeline = make_pipeline(
    CountVectorizer(stop_words='english', ngram_range=(1, 2)),
    MultinomialNB()
)

# Search space: 4 * 3 * 4 * 2 = 96 candidates, each evaluated on 5 folds.
# The ngram_range given to the vectorizer above is overridden by the grid.
parameters = {
    'countvectorizer__max_features': [1000, 2000, 3000, 5000],
    'countvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'multinomialnb__alpha': [0.5, 1.0, 1.5, 2.0],
    'multinomialnb__fit_prior': [True, False]
}

# Exhaustive 5-fold cross-validated grid search using all available cores.
# refit=True (the default, spelled out here) retrains the best candidate on
# the whole training set once the search finishes.
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, refit=True)
grid_search.fit(df_train['text'], df_train['sentiment'])

# best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Because of refit=True, best_estimator_ is already fitted on all of
# df_train; fitting it a second time would only repeat that work.
best_model = grid_search.best_estimator_
best_model

In [None]:
# Label every held-out test document with the tuned pipeline.
y_pred = best_model.predict(X=df_test['text'])

In [None]:
# Per-class precision / recall / F1 of the predictions against the test labels.
print(
    classification_report(
        y_true=df_test['sentiment'],
        y_pred=y_pred,
    )
)

In [None]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) to the working
# directory so it can be reloaded later without re-running the search.
joblib.dump(value=best_model, filename='count_mnb_aug.joblib')