In [59]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

In [60]:
def get_dataset_path(filename):
    """
    Constructs the full path to the dataset file.

    Parameters:
    filename (str): The name of the dataset file.

    Returns:
    str: The full path to the dataset file.
    """
    current_dir = os.getcwd()
    dataset_path = os.path.join(current_dir, filename)
    return dataset_path

In [62]:
def load_dataset(path):
    """
    Loads the dataset from the given path.

    Parameters:
    path (str): The path to the dataset file.

    Returns:
    pd.DataFrame: The loaded dataset.
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(f"The dataset file was not found at: {path}")
    
    data = pd.read_csv(path, encoding='latin-1')
    data.columns = ['polarity', 'id', 'date', 'query', 'user', 'text']
    return data


In [64]:
def preprocess_data(data):
    """
    Preprocesses the data for training.

    Parameters:
    data (pd.DataFrame): The dataset.

    Returns:
    tuple: Tuple containing the feature matrix and the target vector.
    """
    X = data['text']
    y = data['polarity']

    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(X)
    return X,y, vectorizer

In [65]:
def train_model(X_train, y_train):
    """
    Trains a Naive Bayes model

    Parameters:
    X_train (sparse matrix): The training feature matrix.
    y_train (pd.Series): The training target vector.

    Returns:
    MultinomialNB: The trained model.
    """

    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model

In [66]:
def evaluate_model(model, X_test, y_test):
    """
    Evaluates the trained model

    Parameters:
    model (MultinomialNB): The trained model.
    X_test (sparse matrix): The test feature matrix.
    y_test (pd.Series): The test target vector.

    Returns:
    None
    """

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Classification Report:\n{report}')
    

In [67]:
def save_model_and_vectorizer(model, vectorizer, model_path='sentiment_model.pkl', vectorizer_path='vectorizer.pkl'):
    """
    Saves the trained model and vectorizer to disk.

    Parameters:
    model (MultinomialNB): The trained model.
    vectorizer (CountVectorizer): The vectorizer.
    model_path (str): The path to save the model.
    vectorizer_path (str): The path to save the vectorizer.

    Returns:
    None
    """
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)




In [68]:
# Define dataset file name
dataset_filename = 'sentiment140.csv'

# Get the path to the dataset
dataset_path = get_dataset_path(dataset_filename)
print(f'Dataset path {dataset_path}')

# Load the dataset
data = load_dataset(dataset_path)
print(f'Dataset loaded. Shape: {data.shape}')
print(data['polarity'].value_counts())

# Preprocess the data
X, y, vectorizer = preprocess_data(data)
print(f'Shape of X afetr vectorizer: {X.shape}')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')

# Train the model
model = train_model(X_train, y_train)

# Evaluate the model
evaluate_model(model, X_test, y_test)

# Save the model and vectorizer
save_model_and_vectorizer(model, vectorizer)

Dataset path c:\Users\najmu\Desktop\text-sentiment-analyzer\text-sentiment-analyzer\python-scripts\sentiment140.csv
Dataset loaded. Shape: (1599999, 6)
polarity
4    800000
0    799999
Name: count, dtype: int64
Shape of X afetr vectorizer: (1599999, 684046)
Shape of X_train: (1279999, 684046)
Shape of y_train: (1279999,)
Accuracy: 76.85%
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.80      0.77    159494
           4       0.79      0.74      0.76    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

