<a href="https://colab.research.google.com/github/muniprasadreddy/my-colab_work/blob/main/Amazon__unlocked_mobile__reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Problem Statement:
        our goal is to perform sentiment analysis on customer reviews of client's products on Amazon. The objective is to build a predictive model that can classify reviews into positive, negative, or neutral sentiments based on the expressed opinions. The analysis aims to understand the overall sentiment of customers towards the products and provide insights for product improvement, marketing strategies, and customer satisfaction."


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt
%matplotlib inline
import re
import random
plt.style.use("ggplot")


In [None]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
english_stemmer=SnowballStemmer('english')

In [None]:
data=pd.read_csv('/content/Amazon_Unlocked_Mobile.csv.zip')

In [None]:
print("Initial Memory Usage:")
print("Memory used in MB" ,data.memory_usage(deep=True).sum()/(1024*1024))
pre=data.memory_usage(deep=True).sum()/(1024*1024)
# Step 3: Identify columns for optimization
# For example, let's say we want to convert all int columns to int32 and all float columns to float32
int_columns = data.select_dtypes(include='int').columns
float_columns = data.select_dtypes(include='float').columns
for col in int_columns:
    c_min=data[col].min()
    c_max=data[col].max()
    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
              data[col] = data[col].astype(np.int8)
    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
        data[col] = data[col].astype(np.int16)
    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
        data[col] = data[col].astype(np.int32)
    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
        data[col] = data[col].astype(np.int64)
for col in float_columns:
  c_min=data[col].min()
  c_max=data[col].max()
  if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                data[col] = data[col].astype(np.float16)
  elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
      data[col] = data[col].astype(np.float32)
  else:
      data[col] = data[col].astype(np.float64)
print("Optimized Memory:")
print("Memory used in MB" ,data.memory_usage(deep=True).sum()/(1024*1024))
post=data.memory_usage(deep=True).sum()/(1024*1024)
print("++++++++++++++++++++++++")
print("lift in memory",((post-pre)/pre)*100)

Initial Memory Usage:
Memory used in MB 198.3385648727417
Optimized Memory:
Memory used in MB 190.839861869812
++++++++++++++++++++++++
lift in memory -3.780758929934285


In [None]:
data.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,200.0,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,200.0,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,200.0,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,200.0,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,200.0,4,Great phone to replace my lost phone. The only...,0.0


In [None]:
data.tail()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
413835,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.9375,5,another great deal great price,0.0
413836,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.9375,3,Ok,0.0
413837,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.9375,5,Passes every drop test onto porcelain tile!,0.0
413838,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.9375,3,I returned it because it did not meet my needs...,0.0
413839,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.9375,4,Only downside is that apparently Verizon no lo...,0.0


In [None]:
data.shape

(413840, 6)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Product Name  413840 non-null  object 
 1   Brand Name    348669 non-null  object 
 2   Price         407907 non-null  float16
 3   Rating        413840 non-null  int8   
 4   Reviews       413778 non-null  object 
 5   Review Votes  401544 non-null  float16
dtypes: float16(2), int8(1), object(3)
memory usage: 11.4+ MB


### Product Name
The name or title of the mobile phone product.

### Brand Name
The brand or manufacturer of the mobile phone.

### Price
The price of the mobile phone, indicating the cost to the customer.

### Rating
The numerical rating assigned to the mobile phone by customers. Typically, ratings range from 1 to 5, where higher values represent better satisfaction.

### Reviews
Textual feedback or reviews provided by customers who have purchased and used the mobile phone. These reviews may include positive, negative, or neutral sentiments.

### Review Votes
The number of votes or helpfulness indicators received for a particular review. Customers may vote on the usefulness of reviews, and this column represents the aggregated count of such votes.


In [None]:
data.isna().sum()

Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            62
Review Votes    12296
dtype: int64

In [None]:
data.isna().sum()

Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            62
Review Votes    12296
dtype: int64

In [None]:
data.describe()

Unnamed: 0,Price,Rating,Review Votes
count,407907.0,413840.0,401544.0
mean,,3.819578,
std,,1.548216,
min,1.730469,1.0,0.0
25%,80.0,3.0,0.0
50%,144.75,5.0,0.0
75%,270.0,5.0,1.0
max,2598.0,5.0,645.0


In [None]:
data=data[['Rating','Reviews']]

In [None]:
data

Unnamed: 0,Rating,Reviews
0,5,I feel so LUCKY to have found this used (phone...
1,4,"nice phone, nice up grade from my pantach revu..."
2,5,Very pleased
3,4,It works good but it goes slow sometimes but i...
4,4,Great phone to replace my lost phone. The only...
...,...,...
413835,5,another great deal great price
413836,3,Ok
413837,5,Passes every drop test onto porcelain tile!
413838,3,I returned it because it did not meet my needs...


In [None]:
data.isna().any()

Rating     False
Reviews     True
dtype: bool

In [None]:
data.isna().values.sum()

62

In [None]:
data = data[data['Reviews'].isnull()==False]

In [None]:
#data=data.dropna()
data.shape

(413778, 2)

In [None]:
data['Rating'].value_counts()

5    223580
1     72337
4     61374
3     31763
2     24724
Name: Rating, dtype: int64

### 1. Positive Reviews Count
A count of reviews with a positive sentiment, based on the User Rating.

### 2. Negative Reviews Count
A count of reviews with a negative sentiment, based on the User Rating.

### 3. Average Review Length
The average length of the reviews, calculated using the number of words or characters.

### 4. Review Helpfulness Ratio
The ratio of Review Votes to the total number of Reviews, indicating the average helpfulness of reviews.

### 5. User Rating Weighted by Review Votes
A weighted average of User Rating, where higher Review Votes contribute more to the overall rating.

### 6. Review Sentiment
A categorical column indicating the sentiment of reviews (positive, negative, neutral) based on predefined thresholds.

### 7. Review Polarity
The polarity of reviews, indicating whether the sentiment is predominantly positive or negative.

### 8. User Rating Deviation
The difference between User Rating and the overall average User Rating, highlighting deviations from the average.

### 9. Review Length Category
Categorizing reviews into short, medium, or long based on their length.

### 10. Helpfulness Percentage
The percentage of Review Votes in comparison to the total number of Reviews, providing a measure of community engagement.



In [None]:
import pandas as pd
from textblob import TextBlob  # For sentiment analysis

# Assuming 'data' is your DataFrame containing 'User Rating' and 'Reviews'

# New Column 1: Length of Reviews
data['Review Length'] = data['Reviews'].apply(lambda x: len(x))

# New Column 2: Number of Words in Reviews
data['Word Count'] = data['Reviews'].apply(lambda x: len(x.split()))

# New Column 3: Average Word Length in Reviews
data['Avg Word Length'] = data['Reviews'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# New Column 4: Sentiment Polarity of Reviews
data['Sentiment Polarity'] = data['Reviews'].apply(lambda x: TextBlob(x).sentiment.polarity)

# New Column 5: Sentiment Subjectivity of Reviews
data['Sentiment Subjectivity'] = data['Reviews'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

# New Column 6: Number of Positive Reviews (Assuming sentiment polarity > 0.1 is positive)
data['Positive Reviews'] = data['Sentiment Polarity'].apply(lambda x: 1 if x > 0.1 else 0)

# New Column 7: Number of Negative Reviews (Assuming sentiment polarity < -0.1 is negative)
data['Negative Reviews'] = data['Sentiment Polarity'].apply(lambda x: 1 if x < -0.1 else 0)

# New Column 8: Number of Neutral Reviews (Assuming sentiment polarity between -0.1 and 0.1 is neutral)
data['Neutral Reviews'] = data['Sentiment Polarity'].apply(lambda x: 1 if -0.1 <= x <= 0.1 else 0)

# New Column 9: Ratio of Positive to Negative Reviews
data['Pos/Neg Ratio'] = data['Positive Reviews'] / data['Negative Reviews'].replace(0, 1)

# New Column 10: Ratio of Total Reviews to Review Length
data['Review/Length Ratio'] = data['Review Length'] / data['Word Count'].replace(0, 1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Review Length'] = data['Reviews'].apply(lambda x: len(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Word Count'] = data['Reviews'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Avg Word Length'] = data['Reviews'].apply(lambda x: sum(len(word) 

In [None]:
data.head()

Unnamed: 0,Rating,Reviews,Review Length,Word Count,Avg Word Length,Sentiment Polarity,Sentiment Subjectivity,Positive Reviews,Negative Reviews,Neutral Reviews,Pos/Neg Ratio,Review/Length Ratio
0,5,I feel so LUCKY to have found this used (phone...,374,72,4.208333,0.180952,0.611,1,0,0,1.0,5.194444
1,4,"nice phone, nice up grade from my pantach revu...",214,40,4.375,0.449259,0.762222,1,0,0,1.0,5.35
2,5,Very pleased,12,2,5.5,0.65,1.0,1,0,0,1.0,6.0
3,4,It works good but it goes slow sometimes but i...,76,17,3.529412,0.4525,0.595,1,0,0,1.0,4.470588
4,4,Great phone to replace my lost phone. The only...,223,43,4.209302,0.214583,0.708333,1,0,0,1.0,5.186047


In [None]:
# print (data[data['Rating']==5].count())
# print (data[data['Rating']<5].count())


In [None]:
data.Reviews[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [None]:
###preprocesing steps

In [None]:
### REmove HTML
review_text= BeautifulSoup(data.Reviews[0]).get_text()
print (review_text)

I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!


In [None]:
## Remove non_letters( Numbers and Special Characters)

review_text = re.sub("[^a-zA-Z]"," ", review_text)
print (review_text)

I feel so LUCKY to have found this used  phone to us   not used hard at all   phone on line from someone who upgraded and sold this one  My Son liked his old one that finally fell apart after      years and didn t want an upgrade   Thank you Seller  we really appreciate it   your honesty re  said used phone I recommend this seller very highly   would but from them again  


In [None]:
## Convert words to lower case and split them
words = review_text.lower().split()
print (words)

['i', 'feel', 'so', 'lucky', 'to', 'have', 'found', 'this', 'used', 'phone', 'to', 'us', 'not', 'used', 'hard', 'at', 'all', 'phone', 'on', 'line', 'from', 'someone', 'who', 'upgraded', 'and', 'sold', 'this', 'one', 'my', 'son', 'liked', 'his', 'old', 'one', 'that', 'finally', 'fell', 'apart', 'after', 'years', 'and', 'didn', 't', 'want', 'an', 'upgrade', 'thank', 'you', 'seller', 'we', 'really', 'appreciate', 'it', 'your', 'honesty', 're', 'said', 'used', 'phone', 'i', 'recommend', 'this', 'seller', 'very', 'highly', 'would', 'but', 'from', 'them', 'again']


In [None]:
 ## Remove Stopwords
import nltk
nltk.download('stopwords')
stops = set(stopwords.words("english"))
words = [w for w in words if not w in stops]

print (stops)
print (" \nwords after removing stopwords\n")
print (words)

{"wasn't", 'just', 'i', 'by', "couldn't", 'why', 'couldn', "hasn't", 'hasn', 'ours', 'than', 'once', 'were', "shouldn't", 'in', 'hadn', "you'd", 'under', 'no', 'her', 'have', 'there', 'most', "hadn't", "wouldn't", 'so', 'more', 'aren', 'when', 'didn', 'if', 'very', "mightn't", 'mustn', 'ourselves', 'too', 'but', 'until', 'can', 'this', 'how', 'at', 'd', 'other', 'shan', 'it', 'all', 'nor', 'below', 'out', 'myself', 'about', 'been', 'any', "should've", 'our', "it's", 'does', 'me', 'that', 'my', 'o', 'itself', 'shouldn', 'off', 'both', 'above', 'as', 'll', 've', 'and', 'should', 'himself', 'being', 'to', 'during', "mustn't", 'whom', 's', 'won', 'further', "weren't", 'having', 'such', 'ma', 'we', "she's", 'they', 'am', 'same', 'has', 'or', 'some', 'do', 'before', 'again', 'from', 'over', "doesn't", "needn't", "won't", 'needn', 'wouldn', 'had', "aren't", 'he', 't', 'against', "shan't", 'him', "isn't", 'you', 'did', 'doing', 'is', 'y', 'ain', 'these', 'on', 'yours', 'weren', 'mightn', 'thei

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
### Stem the words

stemmer = english_stemmer
c=[]
for word in words:
    c.append(stemmer.stem(word))
print (c)

['feel', 'lucki', 'found', 'use', 'phone', 'us', 'use', 'hard', 'phone', 'line', 'someon', 'upgrad', 'sold', 'one', 'son', 'like', 'old', 'one', 'final', 'fell', 'apart', 'year', 'want', 'upgrad', 'thank', 'seller', 'realli', 'appreci', 'honesti', 'said', 'use', 'phone', 'recommend', 'seller', 'high', 'would']


In [None]:
 #Write all the steps in one function now

def review_to_wordlist( review, remove_stopwords=True):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()

    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (True by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    b=[]
    stemmer = english_stemmer #PorterStemmer()
    for word in words:
        b.append(stemmer.stem(word))

    # 5. Return a list of words
    return(b)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(  data['Reviews'],data['Rating'],test_size=0.20, random_state=42)


In [None]:
 #Clean all the reviews

clean_train_reviews = []
for review in X_train:
    clean_train_reviews.append( " ".join(review_to_wordlist(review)))



  review_text = BeautifulSoup(review).get_text()
  review_text = BeautifulSoup(review).get_text()


In [None]:
clean_test_reviews = []
for review in X_test:
    clean_test_reviews.append( " ".join(review_to_wordlist(review)))

  review_text = BeautifulSoup(review).get_text()
  review_text = BeautifulSoup(review).get_text()


In [None]:
clean_train_reviews[0:5]

['phone awesom great buy anyon look get money',
 'muy bueno',
 'blackberri mobil somewhat good phone phone u sent u advertis look like toy phone',
 'excelent producto',
 'excel']

# CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming X_train is a list of text documents
X_train = clean_train_reviews

# Initialize CountVectorizer
vect = CountVectorizer()

# Fit and transform the training data
X_train_vectorized = vect.fit_transform(X_train)
print(X_train_vectorized)
# Get the feature names
feature_names = np.array(vect.get_feature_names_out())

# Check the length of feature names
print(len(feature_names))


  (0, 26729)	1
  (0, 2832)	1
  (0, 15710)	1
  (0, 5049)	1
  (0, 1779)	1
  (0, 21010)	1
  (0, 15009)	1
  (0, 22912)	1
  (1, 23430)	1
  (1, 4792)	1
  (2, 26729)	3
  (2, 21010)	1
  (2, 3947)	1
  (2, 22737)	1
  (2, 34046)	1
  (2, 15352)	1
  (2, 32439)	1
  (2, 659)	1
  (2, 20603)	1
  (2, 37548)	1
  (3, 12508)	1
  (3, 28394)	1
  (4, 12497)	1
  (5, 26729)	2
  (5, 25217)	1
  :	:
  (331020, 23723)	1
  (331020, 20755)	1
  (331020, 1193)	1
  (331020, 14551)	1
  (331020, 20235)	1
  (331020, 10115)	1
  (331020, 15436)	1
  (331020, 18493)	1
  (331020, 8550)	1
  (331020, 22399)	1
  (331020, 38102)	1
  (331020, 26382)	1
  (331020, 40502)	1
  (331020, 7740)	1
  (331020, 39882)	1
  (331021, 15710)	2
  (331021, 12497)	1
  (331021, 10285)	1
  (331021, 5319)	1
  (331021, 21137)	1
  (331021, 34290)	1
  (331021, 23089)	1
  (331021, 15084)	1
  (331021, 13259)	1
  (331021, 21115)	1
42184


# **Select Best Features**

# **LogisticRegression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Assuming you have already defined and fitted the vectorizer 'vect' on the training set
X_train_vectorized = vect.fit_transform(X_train)

# Create a logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train_vectorized, y_train)

# Transform the test set using the same vectorizer
X_test_vectorized = vect.transform(X_test)

# Make predictions on the test set
predictions = model.predict(X_test_vectorized)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Optionally, you can print a classification report for more detailed metrics
print("\nClassification Report:\n", classification_report(y_test, predictions))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.6545023925757649

Classification Report:
               precision    recall  f1-score   support

           1       0.67      0.62      0.65     14422
           2       0.38      0.05      0.10      4806
           3       0.37      0.13      0.19      6364
           4       0.35      0.16      0.22     12246
           5       0.69      0.94      0.80     44918

    accuracy                           0.65     82756
   macro avg       0.49      0.38      0.39     82756
weighted avg       0.60      0.65      0.60     82756



In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 8975   229   460   640  4118]
 [ 1675   264   428   533  1906]
 [ 1147    96   822   908  3391]
 [  632    48   313  2020  9233]
 [  880    52   211  1692 42083]]


# **RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming you have a vectorizer for text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the model
rf_model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = rf_model.predict(X_test_vectorized)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)


Accuracy: 0.6972304122963894
Classification Report:
              precision    recall  f1-score   support

           1       0.63      0.74      0.68     14422
           2       0.59      0.24      0.34      4806
           3       0.56      0.27      0.37      6364
           4       0.55      0.28      0.37     12246
           5       0.75      0.90      0.82     44918

    accuracy                           0.70     82756
   macro avg       0.61      0.49      0.52     82756
weighted avg       0.67      0.70      0.67     82756



In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[10700   302   341   412  2667]
 [ 1719  1148   209   345  1385]
 [ 1411   192  1746   524  2491]
 [ 1139   152   362  3471  7122]
 [ 2063   153   476  1591 40635]]


# **DecisionTreeClassifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming you have a vectorizer for text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create a Decision Tree classifier
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = dt_model.predict(X_test_vectorized)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)


Accuracy: 0.6376818599255644
Classification Report:
              precision    recall  f1-score   support

           1       0.57      0.64      0.60     14422
           2       0.36      0.26      0.30      4806
           3       0.36      0.29      0.32      6364
           4       0.40      0.28      0.33     12246
           5       0.75      0.82      0.79     44918

    accuracy                           0.64     82756
   macro avg       0.49      0.46      0.47     82756
weighted avg       0.61      0.64      0.62     82756



In [None]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 9223   875   973   882  2469]
 [ 1447  1254   478   460  1167]
 [ 1190   364  1861   757  2192]
 [ 1172   400   702  3399  6573]
 [ 3068   607  1131  3077 37035]]


# **Multinomial Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import confusion_matrix
# Assuming you have a vectorizer for text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create a Multinomial Naive Bayes classifier
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = nb_model.predict(X_test_vectorized)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.5448910048818213
Classification Report:
              precision    recall  f1-score   support

           1       0.65      0.53      0.59     14422
           2       0.14      0.43      0.21      4806
           3       0.22      0.13      0.16      6364
           4       0.27      0.29      0.28     12246
           5       0.78      0.69      0.73     44918

    accuracy                           0.54     82756
   macro avg       0.41      0.41      0.39     82756
weighted avg       0.60      0.54      0.57     82756

Confusion Matrix:
[[ 7697  3970   735   704  1316]
 [ 1269  2060   409   540   528]
 [  944  1955   812  1250  1403]
 [  565  2230   655  3496  5300]
 [ 1299  4422  1075  7094 31028]]


# **TfidataVectorizer**

In [None]:
## TfIdata reflects importance of words for classification.

# Fit the TfidataVectorizer to the training data specifiying a minimum document frequency of 5.
# this might prevent overfitting
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate TfidfVectorizer with min_data parameter
vect = TfidfVectorizer(min_df=5).fit(X_train)

# Get feature names
feature_names = vect.get_feature_names_out()

# Print the number of feature names
print(len(feature_names))


14165


# **RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=25, stop_words='english')  # You can adjust max_features and other parameters

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)  # You can adjust n_estimators and other parameters

# Train the model
rf_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
predictions = rf_model.predict(X_test_tfidf)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.5783870655904103
Classification Report:
              precision    recall  f1-score   support

           1       0.47      0.38      0.42     14422
           2       0.32      0.08      0.13      4806
           3       0.35      0.08      0.13      6364
           4       0.35      0.11      0.17     12246
           5       0.62      0.89      0.73     44918

    accuracy                           0.58     82756
   macro avg       0.42      0.31      0.32     82756
weighted avg       0.52      0.58      0.51     82756

Confusion Matrix:
[[ 5478   253   254   414  8023]
 [ 1211   374    97   265  2859]
 [ 1109   132   502   398  4223]
 [ 1166   142   226  1400  9312]
 [ 2687   254   361  1505 40111]]


# **Multinomial Naive Bayes**

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix


#Create a TF-Idata vectorizer
tfidata_vectorizer = TfidataVectorizer(max_features=50, stop_words='english')  # Adjust parameters based on your requirements

# Fit and transform the training data
X_train_tfidata = tfidata_vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_tfidata = tfidata_vectorizer.transform(X_test)

# Create a Multinomial Naive Bayes classifier
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train_tfidata, y_train)

# Make predictions on the test set
predictions = nb_model.predict(X_test_tfidata)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)
# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)

NameError: name 'TfidataVectorizer' is not defined

# **DecisionTreeClassifier**

In [None]:
from sklearn.feature_extraction.text import TfidataVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


tfidata_vectorizer = TfidataVectorizer(max_features=50, stop_words='english')
X_train_tfidata = tfidata_vectorizer.fit_transform(X_train)
X_test_tfidata = tfidata_vectorizer.transform(X_test)

clf = DecisionTreeClassifier()
clf.fit(X_train_tfidata, y_train)
predictions = clf.predict(X_test_tfidata)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

 #Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:")
print(class_report)
# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)


# **n_Grams**

In [None]:
### fit the countVectorizer to the training data specifiying a minimum
# document frequencyof 5 and extracting 1-gram and 2-gram
vect = CountVectorizer(min_data=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

