In [1]:
import joblib
import pandas as pd
pd.options.mode.chained_assignment = None  # avoid slide-copy-warning 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
# Everything needed to locate, read and split the raw CSV.
# "columns" is handed to read_csv as the column names, and "test_size" is the
# fraction of rows passed to train_test_split for the held-out set.
DATASET_CONFIGURATION = dict(
    path="data/train.csv",
    columns=["label", "ids", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
    test_size=0.2,
)

# Number of rows randomly drawn from the full dataset before splitting.
SAMPLE_SIZE = 200_000

### Reading the data

The dataset has the following columns:
    
- *label*: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)<br>
- *ids*: the id of the tweet (e.g. 2087)<br>
- *date*: the date of the tweet (e.g. Sat May 16 23:58:44 UTC 2009)<br>
- *flag*: the query (e.g. lyx); if there is no query, this value is NO_QUERY<br>
- *user*: the user that tweeted (e.g. robotickilldozr)<br>
- *text*: the text of the tweet (e.g. Lyx is cool)<br>

You can find and download the data from [here](https://www.kaggle.com/datasets/kazanova/sentiment140).

In [3]:
# Read the raw CSV; the column names are supplied from the configuration
# through `names=` rather than taken from the file.
data_frame = pd.read_csv(
    DATASET_CONFIGURATION["path"],
    encoding=DATASET_CONFIGURATION["encoding"],
    names=DATASET_CONFIGURATION["columns"],
)

# Work on a fixed-seed random subset to keep the runtime manageable.
# The sample size is capped at the number of rows actually loaded, because
# DataFrame.sample (without replacement) raises when asked for more rows than
# the frame contains.
sampled_data = data_frame.sample(
    n=min(SAMPLE_SIZE, len(data_frame)),
    random_state=42,
)

### Splitting the data

In [4]:
# Hold out the configured fraction of the sampled rows for evaluation;
# the fixed seed keeps the partition the same on every run.
data_train, data_test = train_test_split(
    sampled_data,
    test_size=DATASET_CONFIGURATION["test_size"],
    random_state=42,
)

### Cleaning the data

In [5]:
# Keep the target aside before the frame is reduced to its text column.
y_train = data_train['label']

# Everything except the tweet text is removed from the training frame.
columns_to_drop = ['label', 'ids', 'date', 'flag', 'user']
data_train = data_train.drop(columns_to_drop, axis=1)
data_train

Unnamed: 0,text
447515,@biglime same I googled it and its the same f...
1119902,@stinamfking I remembered. I think. Penny Lane...
19368,i am depressed again. i miss home so much rig...
1468176,@andrewjennings i think someones excited about...
1048930,@greggarbo One of my favorite songs. australi...
...,...
1114266,@samanthai Awww *blush* Thanks!! You are a god...
408466,"@therealTiffany awe, I'm sorry I hope you fee..."
1522785,"@AureliusTjin Haha, yeah. Your name is very u..."
387960,i need money for tickets


In [6]:
# Text column of the training split; preprocess_text is applied to it at the
# end of this cell.
texts = data_train['text']

# Punctuation marks that survive cleaning; every other symbol is removed.
punctuation = ['!', '.']

def clean_text_with_punctuation(text):
    """Strip every character that is not alphanumeric, whitespace or kept punctuation.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` containing only characters for which
        ``str.isalnum`` or ``str.isspace`` is true, or that appear in the
        module-level ``punctuation`` list. Removed characters are dropped
        without a replacement, so neighbouring characters become adjacent.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError(f'Expected input of type str, got {type(text)}')

    def is_kept(char):
        return char.isalnum() or char.isspace() or char in punctuation

    return ''.join(filter(is_kept, text))

# The English stop-word set is built once here. Fetching the word list and
# converting it to a set inside tokenize() repeated that work for every tweet.
_STOP_WORDS = frozenset(stopwords.words('english'))

def tokenize(text):
    """Split text into word tokens and drop English stop words.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokens produced by ``word_tokenize``, in their original
        order, excluding those found in NLTK's English stop-word list. The
        membership check is an exact, case-sensitive string comparison.
    """
    return [word for word in word_tokenize(text) if word not in _STOP_WORDS]

# One shared stemmer instance, reused for every token.
stemmer = PorterStemmer()

def stem(tokens):
    """Return the Porter stem of each token.

    Args:
        tokens: An iterable of word strings.

    Returns:
        A new list with ``stemmer.stem`` applied to every token, in the same
        order as the input.
    """
    return [stemmer.stem(token) for token in tokens]

def preprocess_text(text):
    """Turn one raw text into a space-separated string of stemmed tokens.

    The text is lower-cased, passed through ``clean_text_with_punctuation``,
    tokenized with ``tokenize`` and stemmed with ``stem``; the resulting
    tokens are joined with single spaces.

    Args:
        text: The raw string to preprocess.

    Returns:
        The preprocessed text as a single string.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        lowered = text.lower()
        stripped = clean_text_with_punctuation(lowered)
        return ' '.join(stem(tokenize(stripped)))

    raise ValueError(f'Expected input of type str, got {type(text)}')

# Replace the training text with its preprocessed form and preview the result.
data_train['text'] = texts.map(preprocess_text)
data_train.head()

Unnamed: 0,text
447515,biglim googl everyon probli jst isnt yet
1119902,stinamfk rememb . think . penni lane ! that sf...
19368,depress . miss home much right !
1468176,andrewjen think someon excit toni ! !
1048930,greggarbo one favorit song . australiagameshol...


### Training the model

In [7]:
# Training features: the text column of the training frame.
X_train = data_train['text']

# Preprocess the held-out text into a new Series instead of writing it back
# into data_test. The source column is left untouched, so re-running this cell
# always starts from the same input rather than preprocessing text that was
# already preprocessed.
X_test = data_test['text'].apply(preprocess_text)
y_test = data_test['label']

In [8]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Keys follow make_pipeline's "<lowercased class name>__<parameter>" naming.
parameter_candidates = {
    'tfidfvectorizer__max_features': [2000, 2500, 3000],       # maximum number of unique words
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],  # range of n-grams (contiguous sequences of words)
    'multinomialnb__alpha': [0.1, 0.5, 1.0]                    # smoothing parameter in the classifier
}

# find the best hyperparameters
grid_search = GridSearchCV(pipeline, parameter_candidates, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_hyperparams = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparams)

# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so the fitted winner is already available as
# best_estimator_; building and fitting a second, identical pipeline is not needed.
final_model = grid_search.best_estimator_
final_model

Best Hyperparameters: {'multinomialnb__alpha': 1.0, 'tfidfvectorizer__max_features': 3000, 'tfidfvectorizer__ngram_range': (1, 2)}


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=3000, ngram_range=(1, 2))),
                ('multinomialnb', MultinomialNB())])

#### Saving the model

In [9]:
# Persist the fitted pipeline (vectorizer and classifier together) to disk.
joblib.dump(final_model, 'final_model.pkl')

['final_model.pkl']

### Evaluating the model

In [10]:
# Score the fitted pipeline on the held-out split.
y_pred = final_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Per-class precision, recall and F1.
report = classification_report(y_test, y_pred)
print('Classification Report:\n', report)

Accuracy: 0.758225
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.75      0.76     20088
           4       0.75      0.77      0.76     19912

    accuracy                           0.76     40000
   macro avg       0.76      0.76      0.76     40000
weighted avg       0.76      0.76      0.76     40000



In [11]:
# 5-fold cross-validation of the chosen pipeline on the training split
scores = cross_val_score(final_model, X_train, y_train, cv=5)

for fold_number, fold_score in enumerate(scores, start=1):
    print(f'Fold {fold_number} Score: {fold_score}')

# Summarise the spread across folds.
mean_score, std_dev = scores.mean(), scores.std()
print(f'Mean Cross-Validation Score: {mean_score}')
print(f'Standard Deviation of Scores: {std_dev}')

Fold 1 Score: 0.75475
Fold 2 Score: 0.7561875
Fold 3 Score: 0.754
Fold 4 Score: 0.75578125
Fold 5 Score: 0.75309375
Mean Cross-Validation Score: 0.7547625000000001
Standard Deviation of Scores: 0.0011344395642783204
