<a href="https://colab.research.google.com/github/minhazulamin1/Toothpaste-Customer-Review-Text-Classification-Prediction/blob/main/Toothpaste_Text_Classification_%26_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=d0b55afd5a06b2353173966c462889dcca4888c3af72d5313ccbd77631f44d5e
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, precision_score
from sklearn.neighbors import NearestCentroid
import fasttext
import joblib
import warnings

# Display settings: never truncate cell contents or hide columns when printing
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Load the labelled reviews into a dataframe
df = pd.read_csv("/content/Labelled.csv")

# Encode the sentiment label numerically (Positive -> 0, Negative -> 1);
# any other value becomes NaN because Series.map has no entry for it
df['Label'] = df['Label'].map(dict(Positive=0, Negative=1))

# Cleaning Review
def cleaner(Text):
    """Turn one raw review into a list of lower-cased, verb-lemmatised tokens.

    Steps: strip HTML markup, blank out @mentions / URLs / mojibake-prefixed
    tokens, drop every non-alphabetic character, tokenise, lower-case,
    remove English stop words and lemmatise each remaining token as a verb.

    Returns an empty list when ``Text`` is missing (NaN / None).
    """
    if pd.isna(Text):
        return []

    # lxml is the HTML parser backend and must be installed ('pip install lxml')
    plain_text = BeautifulSoup(Text, 'lxml').get_text()

    # Replace @mentions, urls and similar noise (the marker plus the rest of
    # the whitespace-delimited token) with a single space
    without_noise = re.sub(r"(@|http://|https://|ð|Ÿ|Œ|˜|Š|ž|www|\\x)\S*", " ", plain_text)
    # Collapse every run of non-alphabetic characters into a single space
    letters_only = re.sub("[^A-Za-z]+", " ", without_noise)

    words = [token.lower() for token in nltk.word_tokenize(letters_only)]

    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for word in words:
        if word in english_stopwords:
            continue
        cleaned.append(lemmatizer.lemmatize(word, 'v'))
    return cleaned

df['cleaned_review'] = df.Text.apply(cleaner)
# Removing rows with cleaned texts of length 0. The .copy() detaches the
# filtered frame from the original so the in-place edits below do not act on a
# slice (avoids pandas' SettingWithCopyWarning).
df = df[df['cleaned_review'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned texts....")
print(df[['Text','cleaned_review']].head())
df.drop(['Text'], axis=1, inplace=True)
# Saving cleaned text to csv
df.to_csv('cleaned_data.csv', index=False)
# Joining tokens to create strings. TfidfVectorizer does not accept tokens as input
df['cleaned_review'] = [" ".join(row) for row in df['cleaned_review'].values]
data = df['cleaned_review']
Y = df['Label'] # target column

# min_df=.03 means that each ngram (unigram, bigram, & trigram) must be present
# in at least 3% of the documents to be kept as a token (e.g. 30 documents if
# the corpus has 1000 rows).
tfidf = TfidfVectorizer(min_df=.03, ngram_range=(1,3))
# NOTE(review): the vocabulary and IDF weights are learned from the entire
# dataset before cross-validation, so every test fold has influenced the
# features; the scores below may be slightly optimistic.
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values
pd.DataFrame(pd.Series(tfidf.get_feature_names_out())).to_csv('vocabulary.csv', header=False, index=False)
print("Shape of tfidf matrix: ", data_tfidf.shape)

# Implementing Nearest Centroid classifier
# cosine - assessing angle-based similarity, length invariance.
# NOTE(review): recent scikit-learn releases restrict NearestCentroid metrics
# to 'euclidean'/'manhattan' - confirm against the installed version.
model = NearestCentroid(metric='cosine')

# Suppress the UserWarning raised from the nearest-centroid module
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.neighbors._nearest_centroid")

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
nc_scores = []
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)
    # kf.split yields POSITIONAL indices. Y keeps the original dataframe index,
    # which has gaps once empty reviews are dropped, so it must be sliced with
    # .iloc; plain Y[...] would look the numbers up as labels.
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    model.fit(X_train, Y_train) # Fitting the Nearest Centroid Classifier

    Y_pred = model.predict(X_test)
    # Both metrics use scikit-learn's default positive label 1 (Negative reviews)
    recall = recall_score(Y_test, Y_pred)
    precision = precision_score(Y_test, Y_pred)
    nc_scores.append((recall, precision))
    print(f"Iteration {len(nc_scores)}, Recall: {recall}, Precision: {precision}")

# Column-wise mean over the (recall, precision) pairs collected per fold
mean_recall, mean_precision = np.mean(nc_scores, axis=0)
print("Mean cross-validation recall (Nearest Centroid): ", mean_recall)
print("Mean cross-validation precision (Nearest Centroid): ", mean_precision)


# Implementing FastText
# Function to write data to file in FastText format
def write_fasttext_file(filename, X, Y):
    """Write paired texts and labels to ``filename`` in fastText's supervised format.

    Each output line is ``__label__<label> <text>`` followed by a newline.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        X: Iterable of text strings.
        Y: Iterable of labels, paired positionally with ``X``.
    """
    # Explicit UTF-8 so the file does not depend on the platform's default
    # encoding (fastText reads its input as UTF-8).
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(f'__label__{label} {text}\n' for text, label in zip(X, Y))

# Implementing Stratified K-Fold for cross-validation with FastText
ft_scores = []
train_file = 'fasttext_train.txt'

for iteration, (train_index, test_index) in enumerate(kf.split(data, Y), start=1):
    print("Iteration ", iteration)

    # Split data (positional indices from kf.split -> .iloc)
    X_train, X_test = data.iloc[train_index], data.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Write the training fold to disk; fastText trains from a file.
    # The test fold is scored in memory below, so no test file is written.
    write_fasttext_file(train_file, X_train, Y_train)

    # Train FastText model
    model = fasttext.train_supervised(input=train_file,
                                      lr=0.5,           # Learning rate to balance fast learning without overshooting
                                      epoch=25,         # Number of epochs to create sufficient iterations for convergence
                                      dim=100,          # Vector dimension for adequate detail, manageable complexity
                                      minCount=2)       # Minimum count to filter out rare words

    # Evaluate model.
    # model.test() returns (N, precision@1, recall@1) aggregated over ALL
    # labels, so its second element is not the recall of label 1 (and for
    # single-label data it is just overall accuracy). Compute both metrics
    # from per-review predictions instead, the same way as for Nearest Centroid.
    Y_pred = [int(model.predict(text)[0][0] == '__label__1') for text in X_test]
    recall = recall_score(Y_test, Y_pred)
    precision = precision_score(Y_test, Y_pred)
    ft_scores.append((recall, precision))
    print(f"Iteration {len(ft_scores)}, Recall: {recall}, Precision: {precision}")

# Column-wise mean over the (recall, precision) pairs collected per fold
mean_recall, mean_precision = np.mean(ft_scores, axis=0)
print("Mean cross-validation recall (FastText): ", mean_recall)
print("Mean cross-validation precision (FastText): ", mean_precision)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


Printing top 5 rows of dataframe showing original and cleaned texts....
                                                                                                                                                                                                                 Text  \
0                                  pleasant aroma gentle on gums quality product improves oral health perfect for daily use refreshing breath long-lasting effect improves oral health whitens teeth pleasant aroma 👎   
1                                 quality product pleasant aroma pleasant taste perfect for daily use improves oral health refreshing breath eco-friendly packaging gentle on gums refreshing breath gentle on gums 👎   
2                              long-lasting effect good value pleasant aroma perfect for daily use gentle on gums whitens teeth eco-friendly packaging quality product pleasant taste pleasant aroma pleasant aroma 🌿   
3                             highly recommend refreshing br

In [None]:

# After cross-validation `model` is whatever the LAST fold trained, i.e. a
# model fitted on only a subset of the labelled reviews. Retrain on the complete
# labelled set, with the same hyper-parameters used during cross-validation,
# before persisting the model used for prediction.
full_train_file = 'fasttext_full.txt'
write_fasttext_file(full_train_file, data, Y)
model = fasttext.train_supervised(input=full_train_file,
                                  lr=0.5,
                                  epoch=25,
                                  dim=100,
                                  minCount=2)

model.save_model('fasttext_model.bin')

In [None]:
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load the persisted FastText model so this cell does not depend on a `model`
# variable left over from the training cells (the original re-saved the
# in-memory model here instead of loading it).
model = fasttext.load_model('fasttext_model.bin')

# Rebuild the ngram -> column-index mapping saved at training time
vocabulary = pd.read_csv('/content/vocabulary.csv', header=None)
vocabulary_dict = {word: i for i, word in enumerate(vocabulary[0])}
print(vocabulary_dict)
# lowercase=False: the cleaned text is already lower-cased by cleaner()
tfidf = TfidfVectorizer(vocabulary = vocabulary_dict,lowercase=False)

# Reading new data as dataframe
df = pd.read_csv("/content/drive/MyDrive/Data Mining/CA 2/Unlabelled.csv")
pd.set_option('display.max_colwidth', None) # Setting this so we can see the full content of cells
pd.set_option('display.max_columns', None) # to make sure we can see all the columns in output window
# Cleaning reviews
def cleaner(Text):
    """Clean one unlabelled review exactly as the training reviews were cleaned.

    The steps deliberately mirror the cleaner used on the labelled data (same
    noise regex, same verb lemmatisation, same missing-value handling) so the
    text fed to the model at prediction time matches the training text.

    Args:
        Text: Raw review string, or NaN/None for a missing review.

    Returns:
        List of lower-cased, stop-word-free, verb-lemmatised tokens; an empty
        list when ``Text`` is missing.
    """
    # Missing reviews would make BeautifulSoup raise; return no tokens so the
    # row is dropped by the length filter applied after cleaning.
    if pd.isna(Text):
        return []

    # Removing HTML entities such as '&amp', '&quot', '&gt'; lxml is the html
    # parser and should be installed using 'pip install lxml'
    soup = BeautifulSoup(Text, 'lxml')
    souped = soup.get_text()
    # Substituting @mentions, urls, mojibake-prefixed tokens, etc. with whitespace
    # (same pattern as the training cleaner)
    re1 = re.sub(r"(@|http://|https://|ð|Ÿ|Œ|˜|Š|ž|www|\\x)\S*", " ", souped)
    # Substituting any non-alphabetic character that repeats one or more times with whitespace
    re2 = re.sub("[^A-Za-z]+", " ", re1)

    # For more info on regular expressions visit -
    # https://docs.python.org/3/howto/regex.html

    tokens = nltk.word_tokenize(re2)
    lower_case = [t.lower() for t in tokens]

    stop_words = set(stopwords.words('english'))
    filtered_result = [t for t in lower_case if t not in stop_words]

    # Lemmatise as verbs ('v') - the training data was lemmatised this way;
    # the default noun lemmatisation would yield different tokens.
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = [wordnet_lemmatizer.lemmatize(t, 'v') for t in filtered_result]
    return lemmas

df['Cleaned_review'] = df.Text.apply(cleaner)
# Removing rows with cleaned text of length 0. The .copy() detaches the
# filtered frame so the column assignments below do not write into a slice of
# the original dataframe (avoids pandas' SettingWithCopyWarning).
df = df[df['Cleaned_review'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned Texts....")
print(df[['Text','Cleaned_review']].head())
# Joining tokens to create strings: the FastText model predicts on plain text
df['Cleaned_review'] = [" ".join(row) for row in df['Cleaned_review'].values]
data = df['Cleaned_review']

# model.predict returns (labels, probabilities); keep the top label per review
y_pred = [model.predict(text)[0][0] for text in df['Cleaned_review']]
# Strip the '__label__' prefix, leaving the label value as a string (e.g. '0' / '1')
y_pred = [label.split('__')[-1] for label in y_pred]

# Saving predicted labels to csv
df['Predicted_Review'] = y_pred
df.to_csv('Predicted_Review.csv', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{'aroma': 0, 'aroma eco': 1, 'aroma eco friendly': 2, 'aroma good': 3, 'aroma good value': 4, 'aroma highly': 5, 'aroma highly recommend': 6, 'aroma improve': 7, 'aroma improve oral': 8, 'aroma long': 9, 'aroma long last': 10, 'aroma perfect': 11, 'aroma perfect daily': 12, 'aroma pleasant': 13, 'aroma pleasant taste': 14, 'aroma refresh': 15, 'aroma whiten': 16, 'bad': 17, 'bad smell': 18, 'bad smell cause': 19, 'bad smell daily': 20, 'bad smell harmful': 21, 'bad smell ineffective': 22, 'bad smell overprice': 23, 'bad smell recommend': 24, 'bad smell short': 25, 'bad smell strong': 26, 'bad smell whiten': 27, 'besttoothpaste': 28, 'breath': 29, 'breath eco': 30, 'breath eco friendly': 31, 'breath gentle': 32, 'breath gentle gum': 33, 'breath good': 34, 'breath good value': 35, 'breath highly': 36, 'breath highly recommend': 37, 'breath improve': 38, 'breath long': 39, 'breath long last': 40, 'breath perfect': 41, 'breath pleasant': 42, 'breath pleasant aroma': 43, 'breath pleasant ta