<a href="https://colab.research.google.com/github/marjg/datasets3/blob/main/MGilles_Module_4_Copy_of_NLP_Chapter_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# %matplotlib inline

In [6]:
# df = pd.read_csv("/content/drive/MyDrive/Datasets/SMSSpamCollection.txt", sep ='\t', names=['label', 'message'])
url = "/content/TheSocialDilemma.csv"
df = pd.read_csv(url, encoding='ISO-8859-1')  # or try encoding='latin1'
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,Sentiment
0,Mari Smith,"San Diego, California",Premier Facebook Marketing Expert | Social Med...,2007-09-11 22:22:51,579942,288625,11610,False,2020-09-16 20:55:33,@musicmadmarc @SocialDilemma_ @netflix @Facebo...,,Twitter Web App,False,Neutral
1,Mari Smith,"San Diego, California",Premier Facebook Marketing Expert | Social Med...,2007-09-11 22:22:51,579942,288625,11610,False,2020-09-16 20:53:17,@musicmadmarc @SocialDilemma_ @netflix @Facebo...,,Twitter Web App,False,Neutral
2,Varun Tyagi,"Goa, India",Indian | Tech Solution Artist & Hospitality Ex...,2009-09-06 10:36:01,257,204,475,False,2020-09-16 20:51:57,Go watch âThe Social Dilemmaâ on Netflix!\...,,Twitter for iPhone,False,Positive
3,Casey Conway,"Sydney, New South Wales",Head of Diversity & Inclusion @RugbyAU | It's ...,2012-12-28 21:45:06,11782,1033,12219,True,2020-09-16 20:51:46,I watched #TheSocialDilemma last night. Iâm ...,['TheSocialDilemma'],Twitter for iPhone,False,Negative
4,Charlotte Paul,Darlington,Instagram Charlottejyates,2012-05-28 20:43:08,278,387,5850,False,2020-09-16 20:51:11,The problem of me being on my phone most the t...,['TheSocialDilemma'],Twitter for iPhone,False,Positive


In [7]:
len(df)

20068

In [8]:
df.isnull().sum()

user_name              1
user_location       4208
user_description    1383
user_created           0
user_followers         0
user_friends           0
user_favourites        0
user_verified          0
date                   0
text                   0
hashtags            4297
source                 0
is_retweet             0
Sentiment              0
dtype: int64

In [9]:
# Label
df['Sentiment'].unique()

array(['Neutral', 'Positive', 'Negative'], dtype=object)

In [10]:
import pandas as pd

# Assuming df is your original dataframe
# First, separate the classes
positive_df = df[df['Sentiment'] == 'Positive']
negative_df = df[df['Sentiment'] == 'Negative']

# Downsample the majority class
positive_downsampled = positive_df.sample(n=len(negative_df), random_state=42)

# Combine the minority class with the downsampled majority class
balanced_df = pd.concat([positive_downsampled, negative_df])

# Shuffle the dataframe
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Validate the counts
print(balanced_df['Sentiment'].value_counts())


Negative    3573
Positive    3573
Name: Sentiment, dtype: int64


In [11]:
df = balanced_df
df['Sentiment'].value_counts()

Negative    3573
Positive    3573
Name: Sentiment, dtype: int64

In [12]:
df['Sentiment'].value_counts()

Negative    3573
Positive    3573
Name: Sentiment, dtype: int64

In [13]:
# Filter out the specified sentiment values
df = df[~df['Sentiment'].isin(['Neutral'])]

# Validate the counts
print(df['Sentiment'].value_counts())

Negative    3573
Positive    3573
Name: Sentiment, dtype: int64


In [14]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
corpus = []

# Updated loop using iterrows()
for index, row in df.iterrows():
    review = re.sub('[^a-zA-Z]', ' ', row['text'])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
corpus[:5]

['thesocialdilemma liter put spell us inform terrifi time must watch http co sifbmfc',
 'would recommend everi singl individu go digit wellb screen time page phone http co jbdbizsf g',
 'get social media thesocialdilemma',
 'finish watch social dilemma netflix wow thought provok documentari danger http co puanqk qke',
 'serious contempl delet social media watch thesocialdilemma']

# Split the data into "X" (or features) and y (or labels)

In [16]:
X = df['text']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

X_train.shape

(5002,)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect=CountVectorizer()
X_train_counts =count_vect.fit_transform(X_train)

print("Shape of count vectorizer", X_train_counts.shape)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf =tfidf_transformer.fit_transform(X_train_counts)

print("Shape of tfidf feature extraction",X_train_tfidf.shape)

Shape of count vectorizer (5002, 11200)
Shape of tfidf feature extraction (5002, 11200)


In [18]:
type(X_train_tfidf)

# Now the data is prepared.  We must add the tf_idf vectorized data into a Machine Learning Model

In [19]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train_tfidf, y_train)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression()),])
text_clf.fit(X_train, y_train)

# Now we test the model with the test data

In [21]:
predictions = text_clf.predict(X_test)

In [22]:
from sklearn import metrics
print("Confusion Metrics\n",metrics.confusion_matrix(y_test,predictions), end="\n\n\n")

print("Classification Report\n",metrics.classification_report(y_test,predictions), end="\n\n\n")

print("Accuracy Score:", metrics.accuracy_score(y_test,predictions))


Confusion Metrics
 [[848 208]
 [142 946]]


Classification Report
               precision    recall  f1-score   support

    Negative       0.86      0.80      0.83      1056
    Positive       0.82      0.87      0.84      1088

    accuracy                           0.84      2144
   macro avg       0.84      0.84      0.84      2144
weighted avg       0.84      0.84      0.84      2144



Accuracy Score: 0.8367537313432836


In [23]:
def predict_sentiment(text):
    # Preprocess the text: remove non-letters, convert to lowercase, split into words
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()

    # Remove stopwords and apply stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)

    # Predict using the trained model
    prediction = text_clf.predict([review])
    return prediction

# Example usage
input_text = "I love New York"  # Replace with your own text
predicted_sentiment = predict_sentiment(input_text)
print("Predicted Sentiment:", predicted_sentiment)


Predicted Sentiment: ['Positive']


In [24]:
def predict_sentiment_probability(text):
    # Preprocess the text: remove non-letters, convert to lowercase, split into words
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()

    # Remove stopwords and apply stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)

    # Predict using the trained model
    probabilities = text_clf.predict_proba([review])
    return probabilities

# Example usage
input_text = "I am neutral about New York"  # Replace with your own text
predicted_probabilities = predict_sentiment_probability(input_text)
print("Predicted Probabilities:", predicted_probabilities)


Predicted Probabilities: [[0.30438171 0.69561829]]


### SVM

In [25]:
from sklearn.svm import SVC
lr_model = SVC(gamma='auto')


SVC_text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', SVC(gamma='auto')),])
SVC_text_clf.fit(X_train, y_train)

In [26]:
SVC_predictions = SVC_text_clf.predict(X_test)

In [27]:
from sklearn import metrics
print("Confusion Metrics\n",metrics.confusion_matrix(y_test,SVC_predictions), end="\n\n\n")

print("Classification Report\n",metrics.classification_report(y_test,SVC_predictions), end="\n\n\n")

print("Accuracy Score:", metrics.accuracy_score(y_test,SVC_predictions))

Confusion Metrics
 [[1056    0]
 [1088    0]]


Classification Report
               precision    recall  f1-score   support

    Negative       0.49      1.00      0.66      1056
    Positive       0.00      0.00      0.00      1088

    accuracy                           0.49      2144
   macro avg       0.25      0.50      0.33      2144
weighted avg       0.24      0.49      0.33      2144



Accuracy Score: 0.4925373134328358


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
