In [7]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
import joblib
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('wordnet')
nltk.download('stopwords') 
print("Loading JSON data...")
with open('feedback_data.json', 'r') as f:
    data = json.load(f)


df = pd.DataFrame(data)
print("Data loaded successfully.\n")

print("Here are the first few rows of the DataFrame:")
print(df.head())

print("\nChecking for missing values...")
print(df.isnull().sum())

print("\nDistribution of sentiment labels:")
print(df['sentiment'].value_counts())

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    return text
print("\nApplying data cleaning steps...")
df['feedback'] = df['feedback'].apply(clean_text)
print("Data cleaning completed.\n")

print("\nInitializing TF-IDF Vectorizer...")
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

print("Vectorizing cleaned feedback text data...")
X = vectorizer.fit_transform(df['feedback'])
print("Feedback text vectorized.")
print(f"Shape of the tokenized data: {X.shape}\n")

y = df['sentiment']
print(f"Target variable (y) values:\n{y.value_counts()}\n")

df_majority = df[df.sentiment == 'positive']
df_minority = df[df.sentiment == 'negative']

df_minority_upsampled = resample(df_minority, 
                                  replace=True,     
                                  n_samples=len(df_majority),   
                                  random_state=42) 

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

print("\nNew distribution of sentiment labels after upsampling:")
print(df_upsampled['sentiment'].value_counts())

X = vectorizer.transform(df_upsampled['feedback'])
y = df_upsampled['sentiment']


print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}\n")

print("Initializing the Logistic Regression model...")
model = LogisticRegression(max_iter=1000)

params = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(model, param_grid=params, cv=5)
print("Training the model with Grid Search...")
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Model training completed.\n")

print("Making predictions on the test set...")
y_pred = best_model.predict(X_test)
print("Predictions completed.\n")

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Saving the model as 'sentiment_model.pkl'...")
joblib.dump(best_model, 'sentiment_model.pkl')
print("Model saved.\n")

print("Saving the vectorizer as 'vectorizer.pkl'...")
joblib.dump(vectorizer, 'vectorizer.pkl')
print("Vectorizer saved.\n")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gaurisarin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gaurisarin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading JSON data...
Data loaded successfully.

Here are the first few rows of the DataFrame:
                                            feedback sentiment
0      This company truly cares about its employees.  positive
1                  I love the work environment here.  positive
2  I feel like the company doesn’t care about emp...  negative
3                  I love the work environment here.  positive
4                  I love the work environment here.  positive

Checking for missing values...
feedback     0
sentiment    0
dtype: int64

Distribution of sentiment labels:
sentiment
positive    1500
negative    1500
Name: count, dtype: int64

Applying data cleaning steps...
Data cleaning completed.


Initializing TF-IDF Vectorizer...
Vectorizing cleaned feedback text data...
Feedback text vectorized.
Shape of the tokenized data: (3000, 204)

Target variable (y) values:
sentiment
positive    1500
negative    1500
Name: count, dtype: int64


New distribution of sentiment labels after u