In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

In [25]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
data = {
    "review": [
        "I absolutely love this phone! The camera quality is fantastic and battery lasts all day.",
        "Terrible experience. The screen broke within a week, and customer service was useless.",
        "Great value for money. Performance is smooth and build quality feels premium.",
        "The laptop overheats and the battery drains too fast. Very disappointed.",
        "Excellent sound quality and very comfortable to wear. Highly recommend these headphones!",
        "Worst product I’ve ever bought. It stopped working after two days.",
        "Amazing performance and display. Totally worth the price.",
        "Poor design and fragile build. I regret buying this item.",
        "Fast delivery and great packaging. The product works as expected.",
        "Customer support was unhelpful. Will not buy from this brand again."
    ],
    "sentiment": [
        "positive", "negative", "positive", "negative", "positive",
        "negative", "positive", "negative", "positive", "negative"
    ]
}

In [6]:
df = pd.DataFrame(data)
df

Unnamed: 0,review,sentiment
0,I absolutely love this phone! The camera quali...,positive
1,Terrible experience. The screen broke within a...,negative
2,Great value for money. Performance is smooth a...,positive
3,The laptop overheats and the battery drains to...,negative
4,Excellent sound quality and very comfortable t...,positive
5,Worst product I’ve ever bought. It stopped wor...,negative
6,Amazing performance and display. Totally worth...,positive
7,Poor design and fragile build. I regret buying...,negative
8,Fast delivery and great packaging. The product...,positive
9,Customer support was unhelpful. Will not buy f...,negative


In [8]:
#function to lower and clean data

def clean_text(text):
 text=text.lower()
 text=re.sub(r' [^a-zA-Z0-9\s]','',text)
 text=' '.join([word for word in text.split() if word not in stopwords.words('english')])
 return text

In [9]:
#display clean review

df['clean_review']=df['review'].apply(clean_text)
df

Unnamed: 0,review,sentiment,clean_review
0,I absolutely love this phone! The camera quali...,positive,absolutely love phone! camera quality fantasti...
1,Terrible experience. The screen broke within a...,negative,"terrible experience. screen broke within week,..."
2,Great value for money. Performance is smooth a...,positive,great value money. performance smooth build qu...
3,The laptop overheats and the battery drains to...,negative,laptop overheats battery drains fast. disappoi...
4,Excellent sound quality and very comfortable t...,positive,excellent sound quality comfortable wear. high...
5,Worst product I’ve ever bought. It stopped wor...,negative,worst product i’ve ever bought. stopped workin...
6,Amazing performance and display. Totally worth...,positive,amazing performance display. totally worth price.
7,Poor design and fragile build. I regret buying...,negative,poor design fragile build. regret buying item.
8,Fast delivery and great packaging. The product...,positive,fast delivery great packaging. product works e...
9,Customer support was unhelpful. Will not buy f...,negative,customer support unhelpful. buy brand again.


In [12]:
cv=CountVectorizer (max_features=20)
X=cv.fit_transform (df['clean_review']).toarray()
y=df['sentiment']
cv.get_feature_names_out()

array(['again', 'amazing', 'battery', 'bought', 'broke', 'build', 'buy',
       'buying', 'camera', 'comfortable', 'customer', 'days', 'delivery',
       'design', 'disappointed', 'fast', 'great', 'performance',
       'product', 'quality'], dtype=object)

In [13]:
bow_df=pd.DataFrame (X, columns=cv.get_feature_names_out())
bow_df

Unnamed: 0,again,amazing,battery,bought,broke,build,buy,buying,camera,comfortable,customer,days,delivery,design,disappointed,fast,great,performance,product,quality
0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
5,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0
9,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0


In [14]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [17]:
model = LogisticRegression()
model.fit(x_train,y_train)

In [23]:
#User Prediction
user_review=input("Enter your product review: ")
user_review=clean_text(user_review)
user_review_bow=cv.transform([user_review]).toarray()
user_sentiment = model.predict(user_review_bow)
print("Predicted Sentiment:", user_sentiment[0])

Enter your product review: i think it is smelly like a poop
Predicted Sentiment: negative


Practise one

input_text="Natural Language Processing is going to help you out in near future . You have 2 remember this!! for getting interview calls"

**Do the following Tasks:**


1.   Splits text into words/tokens (show the individual word token)
2.   Clean Tokens (show clean token)
3. Remove Stop words (Show token after removing stop words)
4. Stemming (Show the words in their root forms)
5. Lemmatization (Show lemma words)

In [26]:
input_text="Natural Language Processing is going to help you out in near future . You have 2 remember this!! for getting interview calls"

# 1. Splits text into words/tokens (show the individual word token)

tokens=nltk.word_tokenize(input_text)
tokens

['Natural',
 'Language',
 'Processing',
 'is',
 'going',
 'to',
 'help',
 'you',
 'out',
 'in',
 'near',
 'future',
 '.',
 'You',
 'have',
 '2',
 'remember',
 'this',
 '!',
 '!',
 'for',
 'getting',
 'interview',
 'calls']

In [27]:
# 2. Clean Tokens (show clean token)

clean_tokens=[re.sub('[^a-zA-Z]','', token.lower()) for token in tokens if token.isalpha()]
clean_tokens

['natural',
 'language',
 'processing',
 'is',
 'going',
 'to',
 'help',
 'you',
 'out',
 'in',
 'near',
 'future',
 'you',
 'have',
 'remember',
 'this',
 'for',
 'getting',
 'interview',
 'calls']

In [28]:
# 3. Remove Stop words (Show token after removing stop words)

stop_words=set(stopwords.words('english'))
filtered_tokens = [token for token in clean_tokens if token not in stop_words]
filtered_tokens

['natural',
 'language',
 'processing',
 'going',
 'help',
 'near',
 'future',
 'remember',
 'getting',
 'interview',
 'calls']

In [31]:
# 4. Stemming (Show the words in their root forms)

stemmer = PorterStemmer()
stemmed_tokens=[stemmer.stem(token) for token in filtered_tokens]
stemmed_tokens

['natur',
 'languag',
 'process',
 'go',
 'help',
 'near',
 'futur',
 'rememb',
 'get',
 'interview',
 'call']

In [34]:
# 5. Lemmatization (Show lemma words)

lemmatizer=WordNetLemmatizer()
lemmatized_tokens=[lemmatizer.lemmatize(token) for token in filtered_tokens]
lemmatized_tokens

['natural',
 'language',
 'processing',
 'going',
 'help',
 'near',
 'future',
 'remember',
 'getting',
 'interview',
 'call']