In [1]:
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text_list, label_list):
    processed_text_list = []
    for text in text_list:
        #remove tag
        text = re.sub('@[^\s]+','', text)
        
        #remove link
        text = re.sub('((www\.[^\s]+)|(http?://[^\s]+))','',text)
        
        # Remove punctuation
        text = re.sub("[^a-zA-Z]", " ", text)
        
        #lowercase and tokenize
        tokens = nltk.word_tokenize(text.lower())
        
        #remove stopwrods
        tokens = [word for word in tokens if word not in stop_words]  
        
        #stemming
        tokens = [stemmer.stem(word) for word in tokens]
        
        #lemmatization
        #tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
        processed_text_list.append(' '.join(tokens))
    
    
    if label_list:
        df = pd.DataFrame({'text': processed_text_list, 'sentiment': label_list})

        # Remove NaN values
        df.dropna(inplace=True)

        # Remove duplicates
        df.drop_duplicates(subset='text', inplace=True)

        # Get the processed texts and corresponding labels
        processed_text_list = df['text'].astype(str).tolist()
        label_list = df['label'].astype(str).tolist()

        return processed_text_list, label_list
    
    return processed_text_list
    

def saveTxt(file_name, text_list, label_list):
    combined_list = [f"__label__{label} {text.strip()}\n" for label, text in zip(label_list, text_list)]
    
    with open(file_name,'w') as file:
        file.writelines(combined_list)

In [3]:
df_train = pd.read_csv("../train.csv")
print(df_train['sentiment'].value_counts(),'\n')

train_X = df_train['text'].astype(str).tolist()
train_y = df_train['sentiment'].astype(str).tolist()

df_test = pd.read_csv("../test.csv")
test_X = df_test['text'].astype(str).tolist()
test_y = df_test['sentiment'].astype(str).tolist()

sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64 



In [4]:
saveTxt('train.txt', preprocess(train_X), train_y)
saveTxt('test.txt', preprocess(test_X), test_y)

In [5]:
model = fasttext.train_supervised(input='train.txt', autotuneValidationFile='test.txt', autotuneModelSize="2M")
result = model.test('test.txt')
print(result)

# predicted_labels = [model.predict(text)[0][0].replace('__label__','') for text in test_X]

# report = classification_report(test_y, predicted_labels, output_dict=True)
# print(report)
# df_report = pd.DataFrame(report).transpose()

# plt.figure(figsize=(10, 6))
# sns.heatmap(df_report.iloc[:-1, :].astype(float), annot=True, cmap="Blues")
# plt.title('Classification Report')
# plt.savefig('classification_report.png')  # Save the plot as an image file
# plt.show()

Progress:  80.2% Trials:   17 Best score:  0.408192 ETA:   0h 0m59s
Aborting autotune...
Progress:  80.4% Trials:   17 Best score:  0.408192 ETA:   0h 0m58s
Training again with best arguments
Read 0M words
Number of words:  18578
Number of labels: 3
Progress: 100.0% words/sec/thread:  161091 lr:  0.000000 avg.loss:  1.099516 ETA:   0h 0m 0s
Progress: 100.0% words/sec/thread:  159820 lr:  0.000000 avg.loss:  1.092451 ETA:   0h 0m 0s


(3491, 0.4018905757662561, 0.4018905757662561)


In [6]:
model.save_model("model.ftz")