<a href="https://colab.research.google.com/github/masher5764/NLP__Sentiment_Analysis-Reddit_comments/blob/main/NLP__Sentiment_Analysis_Reddit_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Extraction

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [144]:
data = pd.read_csv('/content/reddit-comments.csv')

In [145]:
data[:2]

Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score
0,comment,hyyz6g8,2r97t,datasets,False,1646173750,https://old.reddit.com/r/datasets/comments/t45...,Spatial problem: Suitability of new locations ...,0.0772,1
1,comment,hyyid7v,2r97t,datasets,False,1646167280,https://old.reddit.com/r/datasets/comments/sg9...,Have you tried toying around with GDELT or Ali...,0.0,2


In [146]:
data.isnull().sum()

Unnamed: 0,0
type,0
id,0
subreddit.id,0
subreddit.name,0
subreddit.nsfw,0
created_utc,0
permalink,0
body,2
sentiment,7478
score,0


In [147]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54848 entries, 0 to 54847
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   type            54848 non-null  object 
 1   id              54848 non-null  object 
 2   subreddit.id    54848 non-null  object 
 3   subreddit.name  54848 non-null  object 
 4   subreddit.nsfw  54848 non-null  bool   
 5   created_utc     54848 non-null  int64  
 6   permalink       54848 non-null  object 
 7   body            54846 non-null  object 
 8   sentiment       47370 non-null  float64
 9   score           54848 non-null  int64  
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 3.8+ MB


In [148]:
data['subreddit.nsfw'].value_counts()

Unnamed: 0_level_0,count
subreddit.nsfw,Unnamed: 1_level_1
False,54848


In [149]:
data['type'].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
comment,54848


In [150]:
columns_to_drop = [
    'type',
    'id',
    'subreddit.id',
    'subreddit.name',
    'subreddit.nsfw',
    'created_utc',
    'permalink'
]
data = data.drop(columns=columns_to_drop)

In [151]:
data.head()

Unnamed: 0,body,sentiment,score
0,Spatial problem: Suitability of new locations ...,0.0772,1
1,Have you tried toying around with GDELT or Ali...,0.0,2
2,Damn random internet person of whom I know not...,-0.3851,3
3,Ah nice one. Best of luck with the baby. If yo...,0.9136,3
4,I was about to write and say this shouldn't be...,0.0762,2


In [152]:
data.head()

Unnamed: 0,body,sentiment,score
0,Spatial problem: Suitability of new locations ...,0.0772,1
1,Have you tried toying around with GDELT or Ali...,0.0,2
2,Damn random internet person of whom I know not...,-0.3851,3
3,Ah nice one. Best of luck with the baby. If yo...,0.9136,3
4,I was about to write and say this shouldn't be...,0.0762,2


# Sentiment preprocessing

In [153]:
def label_sentiment(score):
    if score > 0.2:
        return 'positive'
    elif score < -0.2:
        return 'negative'
    else:
        return 'neutral'

In [154]:
data['sentiment'] = data['sentiment'].apply(label_sentiment)
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,27190
neutral,22271
negative,5387


In [155]:
from sklearn.utils import resample

data_majority_positive = data[data['sentiment'] == 'positive']
data_majority_neutral = data[data['sentiment'] == 'neutral']
data_minority = data[data['sentiment'] == 'negative']

In [156]:
data_minority_upsampled = resample(data_minority, replace=True, n_samples=len(data_majority_neutral), random_state=42)
data_minority_upsampled = resample(data_minority, replace=True, n_samples=len(data_majority_positive), random_state=42)

In [157]:
data_upsampled = pd.concat([data_majority_positive, data_majority_neutral, data_minority_upsampled])

In [158]:
x = data_upsampled['body']
y = data_upsampled['sentiment']

In [159]:
y.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,27190
negative,27190
neutral,22271


In [160]:
sentiment_map = {
    'positive': 1,
    'neutral': 0,
    'negative': -1
}

y = y.map(sentiment_map)


In [161]:
y.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,27190
-1,27190
0,22271


# Body Preprocessing

In [162]:
#lower case
x = x.str.lower()

In [163]:
#special character removed
import re
x = x.apply(lambda text: re.sub(r'[^A-Za-z0-9\s]', '', str(text)))

In [164]:
!pip install nltk



In [165]:
#stopwards remove
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

x = x.apply(lambda text: ' '.join([word for word in text.split() if word.lower() not in stop_words]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [166]:
# remove url
x = x.apply(lambda text: re.sub(r'http\S+|www\S+', '', str(text)))

In [167]:
# remove html tags
from bs4 import BeautifulSoup
x = x.apply(lambda text: BeautifulSoup(str(text), "lxml").get_text())

In [168]:
# remove additional spaces
x = x.apply(lambda text: re.sub(r'\s+', ' ', str(text)).strip())

In [169]:
x.head()

Unnamed: 0,body
3,ah nice one best luck baby looking internship ...
5,im exactly sure many contracts enron data set ...
7,another hopeful case completing masters data s...
9,gtvery interesting idea could concept modified...
10,nice one thanks fit nicely assignment regardin...


In [170]:
from nltk.stem import WordNetLemmatizer

# Download required resources
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# Lemmatize all words as VERBs
x = x.apply(lambda text: ' '.join([lemmatizer.lemmatize(word, pos='v') for word in str(text).split()]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [171]:
x.head(2)

Unnamed: 0,body
3,ah nice one best luck baby look internship any...
5,im exactly sure many contract enron data set a...


# Text NLP

In [172]:
corpus = []
corpus.extend(x.to_list())

In [173]:
corpus[:2]

['ah nice one best luck baby look internship anything message case know anyone area interest',
 'im exactly sure many contract enron data set attachments worth look since im really sure kind contract want could scrape end user license agreements internet term service']

In [174]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [176]:
!pip install gensim



In [177]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [178]:
words = []
for sent in corpus:
  sent_token = sent_tokenize(sent)
  for token in sent_token:
    words.append(simple_preprocess(sent))

In [179]:
words[:2]

[['ah',
  'nice',
  'one',
  'best',
  'luck',
  'baby',
  'look',
  'internship',
  'anything',
  'message',
  'case',
  'know',
  'anyone',
  'area',
  'interest'],
 ['im',
  'exactly',
  'sure',
  'many',
  'contract',
  'enron',
  'data',
  'set',
  'attachments',
  'worth',
  'look',
  'since',
  'im',
  'really',
  'sure',
  'kind',
  'contract',
  'want',
  'could',
  'scrape',
  'end',
  'user',
  'license',
  'agreements',
  'internet',
  'term',
  'service']]

# Building a Word2Vec Model

In [180]:
import gensim

In [181]:
model = gensim.models.Word2Vec(words)

In [182]:
model.corpus_count

75686

In [183]:
model.wv.similar_by_word('good')

[('great', 0.717322051525116),
 ('decent', 0.6725327372550964),
 ('sa', 0.6384615898132324),
 ('nice', 0.618061900138855),
 ('best', 0.5880757570266724),
 ('cool', 0.5800597071647644),
 ('easy', 0.5696188807487488),
 ('neat', 0.562980592250824),
 ('bad', 0.5586414337158203),
 ('better', 0.5398711562156677)]

In [184]:
model.wv['good'].shape

(100,)

In [185]:
words[0]

['ah',
 'nice',
 'one',
 'best',
 'luck',
 'baby',
 'look',
 'internship',
 'anything',
 'message',
 'case',
 'know',
 'anyone',
 'area',
 'interest']

In [186]:
def average_word2vec(doc):
  #remove out of vocabulary words
  #sent = [word for word in doc if word in model.wv.index_to_key]
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [187]:
!pip install tqdm



In [188]:
from tqdm import tqdm

In [189]:
vectors = []
for i in tqdm(range(len(words))):
    vectors.append(average_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 75686/75686 [01:54<00:00, 662.77it/s]


In [191]:
x.shape

(76651,)

In [192]:
len(vectors)

75686

In [193]:
vectors[0].shape

(100,)

In [194]:
y.shape

(76651,)

Dimension fixing

In [195]:
y_changed = y[list(map(lambda x : len(x) > 0 , corpus))].reset_index(drop=True)

# Now wrap and encode
y_changed = pd.DataFrame({'sentiment': y_changed})
y_changed = y_changed.iloc[:, 0].values

print("y_changed.shape:", y_changed.shape)


y_changed.shape: (75686,)


In [196]:
y_changed.shape

(75686,)

In [197]:
y_changed[:4]

array([1, 1, 1, 1])

# Preprocessing the data:

In [198]:
df_vectors_x = pd.DataFrame()
for i in range(0, len(vectors)):
  df_vectors_x = pd.concat([df_vectors_x, pd.DataFrame(vectors[i].reshape(1, -1))], ignore_index=True)

  df_vectors_x = pd.concat([df_vectors_x, pd.DataFrame(vectors[i].reshape(1, -1))], ignore_index=True)


In [200]:
df_vectors_x.shape

(75686, 100)

In [201]:
df_vectors_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.198676,0.212165,0.498542,-0.493497,-0.448473,-0.212261,0.16295,0.072062,-0.289549,-0.375928,...,1.073122,-0.306494,-0.118506,-0.340723,0.329164,0.290323,-0.398113,-0.579057,-0.169197,-0.190387
1,0.111617,-0.02119,0.582174,-0.036088,0.162123,-0.071186,0.290551,0.137095,-0.123389,-0.425393,...,0.395957,-0.050537,-0.122113,-0.561382,-0.172445,0.264932,0.310332,-0.504736,-0.042799,-0.198907
2,-0.122853,0.604906,0.673262,-0.572317,0.053416,-0.447343,-0.032668,0.152988,-0.002455,-0.342889,...,0.331928,-0.053936,-0.243202,-0.306839,0.126608,0.284506,-0.021018,-0.572953,-0.460553,-0.161267
3,-0.418571,0.498022,0.105063,-0.036145,-0.188407,-0.174211,-0.030625,0.329951,-0.251516,-0.272507,...,0.325651,-0.452643,0.295972,-0.336683,0.128027,0.097901,0.362289,-0.574914,0.215531,-0.109092
4,-0.262287,0.224919,-0.151304,0.249186,-0.524568,-0.310899,-0.139131,0.165872,-0.330939,-0.165292,...,0.694007,0.193151,-0.124432,0.00492,0.374876,0.106204,-0.215059,-0.172786,0.071516,-0.399676


In [202]:
df = df_vectors_x.copy()
df['output'] = y_changed

In [203]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,output
0,-0.198676,0.212165,0.498542,-0.493497,-0.448473,-0.212261,0.16295,0.072062,-0.289549,-0.375928,...,-0.306494,-0.118506,-0.340723,0.329164,0.290323,-0.398113,-0.579057,-0.169197,-0.190387,1
1,0.111617,-0.02119,0.582174,-0.036088,0.162123,-0.071186,0.290551,0.137095,-0.123389,-0.425393,...,-0.050537,-0.122113,-0.561382,-0.172445,0.264932,0.310332,-0.504736,-0.042799,-0.198907,1
2,-0.122853,0.604906,0.673262,-0.572317,0.053416,-0.447343,-0.032668,0.152988,-0.002455,-0.342889,...,-0.053936,-0.243202,-0.306839,0.126608,0.284506,-0.021018,-0.572953,-0.460553,-0.161267,1
3,-0.418571,0.498022,0.105063,-0.036145,-0.188407,-0.174211,-0.030625,0.329951,-0.251516,-0.272507,...,-0.452643,0.295972,-0.336683,0.128027,0.097901,0.362289,-0.574914,0.215531,-0.109092,1
4,-0.262287,0.224919,-0.151304,0.249186,-0.524568,-0.310899,-0.139131,0.165872,-0.330939,-0.165292,...,0.193151,-0.124432,0.00492,0.374876,0.106204,-0.215059,-0.172786,0.071516,-0.399676,1


In [204]:
df.isnull().sum()

Unnamed: 0,0
0,373
1,373
2,373
3,373
4,373
...,...
96,373
97,373
98,373
99,373


In [205]:
df.dropna(axis = 0, inplace=True)

In [206]:
df.isnull().sum()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
96,0
97,0
98,0
99,0


In [207]:
# Independent Features
x = df.iloc[:, :-1]

# Dependent Features
y = df['output']

In [208]:
x.shape

(75313, 100)

In [209]:
y.shape

(75313,)

# Training

In [210]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [211]:
y_train

Unnamed: 0,output
31448,0
21504,1
23921,1
4614,1
27252,0
...,...
37384,0
6266,1
55259,-1
860,1


In [213]:
from xgboost import XGBClassifier
model_XBR = XGBClassifier()

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.2],
    'reg_lambda': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

In [215]:
from sklearn.model_selection import RandomizedSearchCV
rand_search_XGB = RandomizedSearchCV(estimator=model_XBR, param_distributions=param_grid, cv=5, n_jobs=-1, verbose=2)

In [216]:
y_train = y_train + 1

In [218]:
rand_search_XGB.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [220]:
rand_search_XGB.best_params_

{'subsample': 1.0,
 'reg_lambda': 0.2,
 'reg_alpha': 0.1,
 'n_estimators': 300,
 'min_child_weight': 1,
 'max_depth': 5,
 'learning_rate': 0.2,
 'gamma': 0,
 'colsample_bytree': 0.9}

In [221]:
rand_search_XGB.best_score_

0.8249958506224067

In [222]:
y_pred = rand_search_XGB.predict(x_test)

In [223]:
y_test = y_test + 1

In [224]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.8258647015866694
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      5413
           1       0.85      0.71      0.77      4269
           2       0.82      0.79      0.80      5381

    accuracy                           0.83     15063
   macro avg       0.83      0.82      0.82     15063
weighted avg       0.83      0.83      0.82     15063



# Prediction System

In [229]:
import joblib
import gensim
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load models - Replace these paths with your actual model paths
word2vec_model = model
xgb_model = rand_search_XGB.best_estimator_

# Preprocessing setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = BeautifulSoup(text, "lxml").get_text()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    words = [w for w in text.split() if w not in stop_words]
    lemmatized = [lemmatizer.lemmatize(w, pos='v') for w in words]
    return lemmatized

def get_vector(words):
    vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if not vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

def predict_sentiment(text):
    tokens = preprocess_text(text)
    vector = get_vector(tokens).reshape(1, -1)
    prediction = xgb_model.predict(vector)[0]
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map.get(prediction, "Unknown")

# Simple interaction
while True:
    user_input = input("Enter a comment (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        print("Exited.")
        break
    sentiment = predict_sentiment(user_input)
    print(f"Predicted Sentiment: {sentiment}\n")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Enter a comment (or type 'exit' to quit): This is exactly what I needed today. Thank you for sharing!
Predicted Sentiment: Positive

Enter a comment (or type 'exit' to quit): Honestly, this movie was way better than I expected. Great pacing and solid acting
Predicted Sentiment: Positive

Enter a comment (or type 'exit' to quit): This is a terrible take. You clearly didn’t do your research
Predicted Sentiment: Positive

Enter a comment (or type 'exit' to quit): The game is full of bugs and barely playable. Huge disappointment.
Predicted Sentiment: Negative

Enter a comment (or type 'exit' to quit): I regret wasting my time on this. Zero payoff
Predicted Sentiment: Neutral

Enter a comment (or type 'exit' to quit): I read somewhere that it’s getting an update soon. Might be worth waiting
Predicted Sentiment: Positive

Enter a comment (or type 'exit' to quit): I’ve been using this for months now and it works like a charm.
Predicted Sentiment: Positive

Enter a comment (or type 'exit' to q