# **Read Data**

In [None]:
!pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nadahamdi","key":"<REDACTED>"}'}

In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the bittlingmayer/amazonreviews dataset archive with the Kaggle CLI.
!kaggle datasets download -d bittlingmayer/amazonreviews

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
 98% 481M/493M [00:06<00:00, 56.2MB/s]
100% 493M/493M [00:06<00:00, 77.1MB/s]


In [None]:
!unzip amazonreviews.zip

Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [None]:
import bz2

# (compressed archive, plain-text destination, completion message) for each split,
# processed in this order.
_BZ2_JOBS = (
    ('/content/train.ft.txt.bz2', '/train.ft.txt', 'Train Successful'),
    ('/content/test.ft.txt.bz2', '/test.ft.txt', 'Test Successful'),
)

for archive_path, output_path, done_message in _BZ2_JOBS:
    # Decode the archive as UTF-8 text and copy it line by line into the destination file.
    with bz2.open(archive_path, 'rt', encoding='utf-8') as compressed_file:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(compressed_file)
            print(done_message)

Train Successful
Test Successful


In [None]:
from tqdm import tqdm

# Prefix that introduces the label on every line of the fastText-formatted files.
LABEL_MARKER = '__label__'


def read_fasttext_reviews(path):
    """Parse a fastText-style file into parallel lists of review texts and labels.

    Each line is expected to look like ``__label__<digit><text>``. The single character
    right after the marker becomes the label; everything after that character (including
    the leading separator and the trailing newline) becomes the review text.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        A ``(reviews, labels)`` tuple of equally long lists of strings.
    """
    reviews, labels = [], []
    with open(path, 'r', encoding='utf-8') as file:
        # Stream the file instead of materialising it with readlines().
        for line in tqdm(file, desc=path):
            # Split on the FIRST marker only, so a review that happens to contain
            # '__label__' in its text is not truncated.
            _, marker, payload = line.partition(LABEL_MARKER)
            if not marker or not payload:
                # Blank or unlabeled line: nothing to index into, skip it.
                continue
            labels.append(payload[0])
            reviews.append(payload[1:])
    return reviews, labels


train, train_labels = read_fasttext_reviews('/train.ft.txt')
test, test_labels = read_fasttext_reviews('/test.ft.txt')

100%|██████████| 3600000/3600000 [00:07<00:00, 494925.09it/s]
100%|██████████| 400000/400000 [00:01<00:00, 325085.22it/s]


In [None]:
import pandas as pd

# One frame per split: a "review" column with the text and a "label" column with
# the matching label, row for row.
test_data = pd.DataFrame({"review": test, "label": test_labels})
train_data = pd.DataFrame({"review": train, "label": train_labels})

# **Preprocessing**


In [None]:
# Drop rows whose review text repeats an earlier row of the same split.
train_data = train_data.drop_duplicates(subset='review')
test_data = test_data.drop_duplicates(subset='review')

In [None]:
# Shuffle the rows of each split. The seed is fixed so that a Restart & Run All yields
# the same row order -- the later seeded splits pick rows by position, so an unseeded
# shuffle here would make them non-reproducible as well.
train_data = train_data.sample(frac=1, random_state=42)
test_data = test_data.sample(frac=1, random_state=42)

In [None]:
##CUSTOM DEFINED FUNCTIONS TO CLEAN THE REVIEWS
#!pip install emoji
import re
import string
#import emoji

#Clean emojis from text
def strip_emoji(text):
    """Return ``text`` with every emoji removed.

    Requires the third-party ``emoji`` package. It is imported here because the
    module-level import in this notebook is commented out, which made the function
    raise ``NameError`` when called.

    Args:
        text: Input string.

    Returns:
        The string without emoji characters.
    """
    import emoji

    # emoji >= 2.0 removed get_emoji_regexp() in favour of replace_emoji();
    # fall back to the legacy API on older installations.
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    return emoji.get_emoji_regexp().sub(r"", text)  # Remove emojis

#Remove punctuations, links, mentions and \r\n new line characters
# Deletes every ASCII punctuation character. Built once rather than on each call,
# since the function is applied to every review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def strip_all_entities(text):
    """Lowercase ``text`` and strip line breaks, links, mentions, non-ASCII and punctuation.

    Steps, in order:
      1. drop ``\\r``, turn ``\\n`` into a space, lowercase;
      2. delete ``@...`` mentions and ``http(s)://...`` links up to the next whitespace;
      3. delete every non-ASCII character;
      4. delete every ASCII punctuation character (nothing is inserted in its place,
         so words separated only by punctuation end up joined).

    Args:
        text: Raw review string.

    Returns:
        The cleaned string. Runs of spaces left behind by the removals are not collapsed.
    """
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)  # remove links and mentions
    text = re.sub(r'[^\x00-\x7f]', r'', text)  # remove non-ASCII characters
    # Only ASCII is left at this point, so ASCII punctuation is all that needs deleting.
    return text.translate(_PUNCTUATION_TABLE)

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    """Drop trailing hashtags and unwrap hashtags that appear mid-sentence.

    A hashtag is removed entirely when only other hashtags (and whitespace) follow it up
    to the end of the string; the literal placeholder ``#hashtag`` is exempt from that
    removal. Afterwards every remaining ``#`` and ``_`` is treated as a word separator.

    Args:
        tweet: Input string.

    Returns:
        The cleaned string.
    """
    # Raw string is required here: in a normal string literal '\b' is a backspace
    # character, so the word-boundary check in the lookahead never worked.
    trailing_hashtags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    new_tweet = " ".join(word.strip() for word in re.split(trailing_hashtags, tweet))  # remove last hashtags
    # Remove the hashtag symbol (and underscores) from words in the middle of the sentence.
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet))
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    """Blank out every space-separated word that contains ``$`` or ``&``.

    The offending word is replaced by an empty string rather than dropped, so the
    spaces on both sides of it are kept in the result.
    """
    return ' '.join(
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    )

def remove_mult_spaces(text):  # remove multiple spaces
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character on its own is left untouched.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence and triggers
    # a warning on current Python versions.
    return re.sub(r"\s\s+", " ", text)

In [None]:
# Clean the training reviews: one pass per cleaner, in this exact order.
# strip_emoji is deliberately not part of the chain (its call is disabled in this notebook).
for cleaning_step in (strip_all_entities, clean_hashtags, filter_chars, remove_mult_spaces):
    train_data['review'] = train_data['review'].apply(cleaning_step)


In [None]:
# Clean the test reviews with the same cleaners, in the same order, as the training set.
# strip_emoji is deliberately not part of the chain (its call is disabled in this notebook).
test_data['review'] = (
    test_data['review']
    .apply(strip_all_entities)
    .apply(clean_hashtags)
    .apply(filter_chars)
    .apply(remove_mult_spaces)
)

In [None]:
def remove_digits(test_set):
    """Return a list with the digits 0-9 deleted from every string in ``test_set``.

    Args:
        test_set: Iterable of strings.

    Returns:
        A new list of strings, in the same order as the input.
    """
    digit_eraser = str.maketrans('', '', '0123456789')
    return [entry.translate(digit_eraser) for entry in test_set]
# Strip digits from the reviews of both splits. The result is a plain list, which is
# written back to the column position by position.
for split_frame in (test_data, train_data):
    split_frame['review'] = remove_digits(split_frame['review'])

In [None]:
# Preview the first rows of the cleaned test split.
test_data.head()

Unnamed: 0,review,label
0,great concept but need to be ready to create d...,1
1,awfulterriblevery poorly acted from the castin...,1
2,great minute workout i own just about all of ...,2
3,packaged wrong i bought two of these and both ...,1
4,a lot of stylistic posing without much musical...,1


In [None]:
# Discard training rows without usable text: first missing reviews, then reviews that are
# empty or whitespace-only; finally renumber the rows from 0.
train_data = (
    train_data
    .dropna(subset=['review'])
    .loc[lambda frame: frame['review'].str.strip() != '']
    .reset_index(drop=True)
)

In [None]:
# Discard test rows whose review is missing.
test_data = test_data.dropna(subset=['review'])

# Then discard rows whose review is empty or whitespace-only, and renumber the rows from 0.
has_text = test_data['review'].str.strip() != ''
test_data = test_data[has_text].reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split of the training frame, stratified on the label and
# seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['review'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
    stratify=train_data['label'],
)

# The held-out test split, re-wrapped as single-column DataFrames built from the raw values.
X_test = pd.DataFrame(test_data['review'].values)
y_test = pd.DataFrame(test_data['label'].values)

In [None]:
def _stratified_fraction(features, labels, fraction=0.005, seed=42):
    """Return a seeded, label-stratified subsample of ``features``/``labels``.

    ``fraction`` is passed to ``train_test_split`` as ``train_size``; the complementary
    part of the split is discarded.
    """
    kept_features, _, kept_labels, _ = train_test_split(
        features,
        labels,
        train_size=fraction,
        random_state=seed,
        stratify=labels,
    )
    return kept_features, kept_labels


# Work on a 0.5% stratified sample of every split.
X_train_p, y_train_p = _stratified_fraction(X_train, y_train)
X_val_p, y_val_p = _stratified_fraction(X_val, y_val)
X_test_p, y_test_p = _stratified_fraction(X_test, y_test)



In [None]:
!pip install langdetect
from langdetect import detect

# Function to detect language of a text
def detect_language(text):
    try:
        lang = detect(text)
        return lang == 'en'  # Return True if language is English, False otherwise
    except:
        return False  # Return False if language detection fails





In [None]:
import pandas as pd

# Keep only the sampled training reviews that are detected as English.
is_english = X_train_p.apply(detect_language)
X_train_p = X_train_p.loc[is_english]

# Pick the labels of the surviving rows by index label (looked up in the full y_train).
filtered_indices = X_train_p.index
y_train_p = y_train.loc[filtered_indices]

# Renumber the reviews from 0; the labels keep their original index.
X_train_p = X_train_p.reset_index(drop=True)



In [None]:
import pandas as pd

# Keep only the sampled validation reviews that are detected as English.
is_english = X_val_p.apply(detect_language)
X_val_p = X_val_p.loc[is_english]

# Pick the labels of the surviving rows by index label (looked up in the full y_val).
filtered_indices = X_val_p.index
y_val_p = y_val.loc[filtered_indices]

# Renumber the reviews from 0; the labels keep their original index.
X_val_p = X_val_p.reset_index(drop=True)



In [None]:
# Preview the sampled test reviews before language filtering.
X_test_p.head()

Unnamed: 0,0
333950,concept great in theory i purchased this for m...
109995,packaging was old and damaged did not come wit...
305938,cool video while its cool seeing steve francis...
269300,using with cox cable dvr i bought this to hoo...
188732,pretty this ring is very pretty and fits nicel...


In [None]:
import pandas as pd

# X_test_p arrives as a single-column DataFrame; take that column as a Series of review
# strings. Selecting the column explicitly (rather than a bare squeeze()) still yields a
# Series when only one row is present, and the guard keeps the cell re-runnable.
if isinstance(X_test_p, pd.DataFrame):
    X_test_p = X_test_p.iloc[:, 0]

# Detect the language of the WHOLE review. Each element is a string, so indexing it with
# [0] would hand detect_language only the first character.
X_test_p = X_test_p[X_test_p.apply(detect_language)]

# Get the index labels of the rows that survived the filter
filtered_indices = X_test_p.index

# Select the matching test labels by those index labels
y_test_p = y_test.loc[filtered_indices]

# Renumber the reviews from 0; the labels keep their original index
X_test_p = X_test_p.reset_index(drop=True)



# **NB Model**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts with English stop words removed. The vectorizer is fitted on the
# training sample only; validation and test reuse it.
# Note: X_train / X_val / X_test are rebound here from raw text to count matrices.
v = CountVectorizer(stop_words='english')
X_train = v.fit_transform(X_train_p)
X_val, X_test = (v.transform(text_split) for text_split in (X_val_p, X_test_p))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier (default settings) on the training count matrix.
multNB = MultinomialNB()
multNB.fit(X=X_train, y=y_train_p)

In [None]:
# Predict labels for the vectorized test sample.
Y_pred = multNB.predict(X_test)

In [None]:
# Model performance
from sklearn.metrics import accuracy_score
accuracy_score(y_test_p,Y_pred)


0.7884615384615384