In [7]:
import pandas as pd
import os

# Define the CSV files to load
csv_files = ['happiness.csv', 'angriness.csv', 'sadness.csv']

# Load and concatenate all CSV files into a single DataFrame
data_frames = [pd.read_csv(file) for file in csv_files]
data = pd.concat(data_frames, ignore_index=True)

# Display the first few rows of the concatenated DataFrame
print(data.head())



                                             content  intensity
0  Wants to know how the hell I can remember word...  happiness
1  Love is a long sweet dream & marriage is an al...  happiness
2  The world could be amazing when you are slight...  happiness
3  My secret talent is getting tired without doin...  happiness
4  Khatarnaak Whatsapp Status Ever… Can\’t talk, ...  happiness


In [6]:
print(data.columns)


Index(['content', 'intensity'], dtype='object')


In [5]:
import pandas as pd
import os
import numpy as np

# Define the directory where the CSV files are stored
data_dir = 'Intensity_data'

# Define the CSV files to load (include the directory path)
csv_files = [os.path.join('happiness.csv'), os.path.join('angriness.csv'), os.path.join('sadness.csv')]

# Load and concatenate all CSV files into a single DataFrame
data_frames = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
        data_frames.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")

if len(data_frames) == 0:
    raise ValueError("No data loaded. Check the file paths and formats.")

data = pd.concat(data_frames, ignore_index=True)

# Display the first few rows of the concatenated DataFrame
print(data.head())


                                             content  intensity
0  Wants to know how the hell I can remember word...  happiness
1  Love is a long sweet dream & marriage is an al...  happiness
2  The world could be amazing when you are slight...  happiness
3  My secret talent is getting tired without doin...  happiness
4  Khatarnaak Whatsapp Status Ever… Can\’t talk, ...  happiness


In [9]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle

# Function to clean the text
def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = text.lower()
    return text

# Ensure the clean_text function is defined before this step
# Apply text cleaning to the 'content' column
data['cleaned_text'] = data['content'].apply(clean_text)

# Encode the labels in the 'intensity' column
label_encoder = LabelEncoder()
data['intensity_label'] = label_encoder.fit_transform(data['intensity'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['intensity_label'], test_size=0.2, random_state=42)

# Save label encoder for future use
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)


In [13]:
# Feature Engineering :

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the text data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Save vectorizer for future use
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)


In [16]:
#  Model Selection and Training :
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train_tfidf, y_train)

# Save the trained model for future use
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [15]:
#  Model Evaluation :
# Predict on the test data
y_pred = model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Display classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Model Accuracy: 73.77%
              precision    recall  f1-score   support

   angriness       0.76      0.78      0.77       133
   happiness       0.76      0.72      0.74       156
     sadness       0.68      0.71      0.69       119

    accuracy                           0.74       408
   macro avg       0.74      0.74      0.74       408
weighted avg       0.74      0.74      0.74       408



In [17]:
#  Hyperparameter Tuning (Optional) :
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train_tfidf, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Use the best model
best_model = grid_search.best_estimator_

# Save the best model
with open('best_model.pkl', 'wb') as best_model_file:
    pickle.dump(best_model, best_model_file)


Best Parameters: {'C': 100, 'solver': 'liblinear'}


In [26]:
import pickle

# Load the saved model
with open('best_model.pkl', 'rb') as best_model_file:
    best_model = pickle.load(best_model_file)


In [35]:
import pickle

# Load the saved TfidfVectorizer
with open('vectorizer.pkl', 'rb') as vec_file:
    vectorizer = pickle.load(vec_file)


In [38]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the saved TfidfVectorizer
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Step 2: Load the saved LogisticRegression model
with open('best_model.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

# Step 3: Your new text data
new_data = ["This is a sample text that I want to classify."]

# Step 4: Transform the new text data using the vectorizer
new_data_tfidf = vectorizer.transform(new_data)

# Step 5: Use the loaded model to make a prediction
predictions = best_model.predict(new_data_tfidf)

# Step 6: Print the prediction
print("Predicted class:", predictions[0])


Predicted class: 1
