In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import string
import numpy as np
import re
from nltk.stem.snowball import SnowballStemmer
from collections import defaultdict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


## Loading spaCy and the stemmer
#### A stemmer reduces words to their root forms, which may or may not be actual words

In [2]:
# spaCy English pipeline, used in `process` for tokenisation and the stop-word / punctuation / number flags
nlp = spacy.load("en_core_web_sm")
# Snowball stemmer for English, applied to every token that survives filtering
stemmer = SnowballStemmer(language='english')

In [3]:
# Load the labelled train / test splits; later cells read the 'Category' and 'Description' columns
df_train = pd.read_csv("trainData.csv")
df_test = pd.read_csv('testData.csv')


In [4]:
# Display the training frame (Jupyter renders a truncated preview of the rows)
df_train

Unnamed: 0,Category,Description
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
3388,negative,The company said that its comparable operating...
3389,negative,Operating result for the 12-month period decre...
3390,negative,HELSINKI Thomson Financial - Shares in Cargote...
3391,negative,Net sales of the Paper segment decreased to EU...


**Remove punctuation, perform stemming (the code uses a Snowball stemmer rather than lemmatization) and convert to lowercase, remove stop words (common words that usually don't carry much meaning), and drop whitespace tokens**

In [5]:
# Extra corpus-specific stop words added on top of spaCy's defaults
# (abbreviations, stray punctuation and unit fragments). Each entry is listed once.
# NOTE(review): `process` checks `token.is_stop` on the raw token *before* stemming,
# so a word whose stem equals one of these entries (e.g. a stem 'ad') can presumably
# still reach the TF-IDF vocabulary -- confirm whether that is intended.
words = ['abt', 'ab', 'www', 'abp', 'yit', ',', '.', '?', '%', '00', 'mn', 'nm', 'mm', 'g', 'ad']
for word in words:
    # Register the word in the language-level stop-word set ...
    nlp.Defaults.stop_words.add(word)
    # ... and flag its vocabulary entry too, so `token.is_stop` is True for it
    nlp.vocab[word].is_stop = True

In [6]:
# Size of the stop-word set after the custom entries were added
len(nlp.Defaults.stop_words)

341

In [7]:
def process(doc):
    """Normalise one description into a space-separated string of stems.

    Steps, in order:
      1. standalone numbers (integers or decimals) are replaced by the
         placeholder word 'num';
      2. the text is tokenised with the spaCy pipeline `nlp`;
      3. tokens flagged as stop word, whitespace, punctuation or number-like
         are dropped;
      4. each remaining token's text is stemmed with `stemmer`.

    Parameters
    ----------
    doc : str
        Raw text. Missing values (None, NaN, pd.NA) are treated as empty text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' when nothing survives).

    Raises
    ------
    TypeError
        If `doc` is neither a string nor a missing value.
    """
    if not isinstance(doc, str):
        # A missing cell in the CSV reaches us as NaN/None; re.sub would raise
        # TypeError on it, so map it to an empty result instead.
        if doc is None or doc is pd.NA or (isinstance(doc, float) and np.isnan(doc)):
            return ''
        raise TypeError(f"process() expects a string, got {type(doc).__name__}")

    # Blank text yields no tokens; skip the spaCy pipeline entirely.
    if not doc.strip():
        return ''

    # Replace numbers with 'num'
    doc = re.sub(r'\b\d+(\.\d+)?\b', 'num', doc)

    # Apply spaCy pipeline
    spacy_doc = nlp(doc)

    # Keep only content tokens, then stem them
    stemmed_words = [
        stemmer.stem(token.text)
        for token in spacy_doc
        if not (token.is_stop or token.is_space or token.is_punct or token.like_num)
    ]
    return ' '.join(stemmed_words)

In [8]:
# Derive the cleaned, stemmed text for both splits from their 'Description' column
for split_df in (df_train, df_test):
    split_df['processed_texts'] = split_df['Description'].map(process)

In [9]:
# Example: original test phrase (row 3) before processing
df_test['Description'][3]

"HELSINKI ( AFX ) - Shares closed higher , led by Nokia after it announced plans to team up with Sanyo to manufacture 3G handsets , and by Nokian Tyres after its fourth-quarter earnings report beat analysts ' expectations , dealers said ."

In [10]:
# Example: the same test phrase (row 3) after `process`
df_test['processed_texts'][3]

'helsinki afx share close higher led nokia announc plan team sanyo manufactur handset nokian tyre quarter earn report beat analyst expect dealer said'

In [11]:
# TF-IDF feature extraction.
# max_features=500 caps the vocabulary at 500 terms (scikit-learn keeps the terms with the highest term frequency across the corpus).
# The vectorizer is fitted on the training texts only; the test texts are transformed with that same vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=500)
tfidf_train = vectorizer.fit_transform(df_train['processed_texts'])
tfidf_test = vectorizer.transform(df_test['processed_texts'])

In [12]:
# Build dense DataFrames of the TF-IDF features, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()

# Reuse the source frame's index: pd.concat(axis=1) aligns rows by index label,
# so the feature rows must carry the same labels as the 'Category' column or
# labels and features would be paired incorrectly (or padded with NaN).
tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns=feature_names, index=df_train.index)
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=feature_names, index=df_test.index)

# Label column first, followed by the TF-IDF feature columns
df_train_idf = pd.concat([df_train['Category'], tfidf_train_df], axis=1)
df_test_idf = pd.concat([df_test['Category'], tfidf_test_df], axis=1)



In [13]:
# TF-IDF feature columns of the training frame (label column removed)
tfidf_values = df_train_idf.drop(columns=['Category'])

# A term counts as present in a document when its TF-IDF weight is non-zero,
# so summing the boolean mask per column gives the term's document frequency
document_frequency = tfidf_values.ne(0).sum()

# One-column table, most widespread terms first
df_df = (
    document_frequency
    .to_frame(name='Document_Frequency')
    .sort_values(by='Document_Frequency', ascending=False)
)

In [14]:
# Per-category centroid: the mean TF-IDF weight of every term over the training rows of each Category
average_tfidf_train = df_train_idf.groupby('Category').mean()
average_tfidf_train


Unnamed: 0_level_0,access,accord,account,acquir,acquisit,activ,ad,addit,administr,adp,...,water,wednesday,week,won,work,world,worth,www,year,yesterday
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
negative,0.0,0.008774,0.002628,0.0012,0.001362,0.0027,0.003902,0.008534,0.001759,0.003152,...,0.001475,0.005181,0.001706,0.0,0.006478,0.0,0.0,0.0,0.047293,0.0
neutral,0.002352,0.011479,0.004326,0.006673,0.005468,0.004938,0.004255,0.004405,0.002662,0.000434,...,0.003218,0.001347,0.002672,0.0,0.006217,0.004784,0.003106,0.004267,0.011725,0.001623
positive,0.001601,0.007595,0.002341,0.004204,0.009006,0.001862,0.001864,0.005072,0.001405,0.003788,...,0.000731,0.001026,0.002601,0.007105,0.004838,0.004836,0.00259,0.0,0.03738,0.003584


In [15]:
# NOTE(review): repeats the display already produced by the previous cell
average_tfidf_train

Unnamed: 0_level_0,access,accord,account,acquir,acquisit,activ,ad,addit,administr,adp,...,water,wednesday,week,won,work,world,worth,www,year,yesterday
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
negative,0.0,0.008774,0.002628,0.0012,0.001362,0.0027,0.003902,0.008534,0.001759,0.003152,...,0.001475,0.005181,0.001706,0.0,0.006478,0.0,0.0,0.0,0.047293,0.0
neutral,0.002352,0.011479,0.004326,0.006673,0.005468,0.004938,0.004255,0.004405,0.002662,0.000434,...,0.003218,0.001347,0.002672,0.0,0.006217,0.004784,0.003106,0.004267,0.011725,0.001623
positive,0.001601,0.007595,0.002341,0.004204,0.009006,0.001862,0.001864,0.005072,0.001405,0.003788,...,0.000731,0.001026,0.002601,0.007105,0.004838,0.004836,0.00259,0.0,0.03738,0.003584


In [16]:
import os  # NOTE(review): `os` is not used in any cell visible here; kept in case later cells rely on it
# Persist the per-category mean TF-IDF table as CSV. index=True writes the
# 'Category' index as the first column so it can be restored with index_col on reload.
average_tfidf_train.to_csv('trained_average_tfidf_scores.csv', index=True)


In [17]:
# Also save the same table as a pickle. NOTE(review): the cells visible here only reload the CSV copy.
average_tfidf_train.to_pickle('trained_average_tfidf_scores.pkl')


**Baseline model: always predict the most frequent category**

In [18]:

# Majority class of the *test* labels; used in the next cell as the constant prediction of the baseline.
# NOTE(review): a baseline would normally take the majority class from the training labels -- confirm this is intended.
most_frequent_category = df_test_idf['Category'].mode()[0]
most_frequent_category

'neutral'

In [19]:
# Accuracy of a classifier that always answers `most_frequent_category`:
# the share of non-missing test labels equal to that category
test_labels = df_test_idf['Category']
count_most_frequent_category = test_labels.eq(most_frequent_category).sum()
total_instances = test_labels.notna().sum()
baseline_accuracy = count_most_frequent_category / total_instances
print(f"Baseline Model Accuracy using test data alone: {baseline_accuracy:.2f}")


Baseline Model Accuracy using test data alone: 0.59


In [20]:

from sklearn.metrics.pairwise import cosine_similarity

# Reload the per-category centroids from the CSV written earlier; index_col='Category' restores the category labels as the row index
average_tfidf_train = pd.read_csv('trained_average_tfidf_scores.csv', index_col='Category')

def predict_category(tfidf_vector, centroids_df):
    """Return the label of the centroid most similar to one TF-IDF vector.

    Parameters
    ----------
    tfidf_vector : pandas.Series or numpy.ndarray
        Feature values of a single document. They are used positionally, so
        their order must match the columns of `centroids_df`.
    centroids_df : pandas.DataFrame
        One row per category (the index holds the category labels) and one
        column per feature.

    Returns
    -------
    The index label of the row of `centroids_df` with the highest cosine
    similarity to `tfidf_vector`. np.argmax returns the first maximum, so ties
    resolve to the earliest row of `centroids_df`.

    Raises
    ------
    ValueError
        If `tfidf_vector` is neither a pandas Series nor a numpy array.
    """
    # Guard clause: reject unsupported input types up front
    if not isinstance(tfidf_vector, (pd.Series, np.ndarray)):
        raise ValueError("Input is neither a pandas Series nor a numpy array")

    # cosine_similarity expects 2-D input, so present the vector as one row
    raw_values = tfidf_vector.values if isinstance(tfidf_vector, pd.Series) else tfidf_vector
    query = raw_values.reshape(1, -1)

    # Similarity of the document to every centroid; pick the closest one
    scores = cosine_similarity(query, centroids_df.values)
    best_position = np.argmax(scores)
    return centroids_df.index[best_position]

# Feature columns only. The label columns are dropped once, up front, instead of
# once per row inside the loop; this also makes every row a numeric Series rather
# than an object-dtype one. errors='ignore' keeps the cell re-runnable whether or
# not 'Predicted_Category' has already been added.
test_features = df_test_idf.drop(columns=['Category', 'Predicted_Category'], errors='ignore')

# Nearest-centroid prediction for every test document
predicted_categories = [
    predict_category(features, average_tfidf_train)
    for _, features in test_features.iterrows()
]

# Append the predicted categories to the test DataFrame
df_test_idf['Predicted_Category'] = predicted_categories


In [21]:
# True and predicted labels of the test set
y_true = df_test_idf['Category']
y_pred = df_test_idf['Predicted_Category']

# Overall share of correctly classified test documents
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1-score
print(classification_report(y_true, y_pred))


Accuracy: 0.6586373021335169
              precision    recall  f1-score   support

    negative       0.39      0.65      0.49       181
     neutral       0.75      0.80      0.77       864
    positive       0.64      0.37      0.47       408

    accuracy                           0.66      1453
   macro avg       0.59      0.61      0.58      1453
weighted avg       0.68      0.66      0.65      1453



In [22]:
# Generate and print the confusion matrix.
# Rows are the true categories and columns the predicted ones; with no `labels`
# argument scikit-learn orders the classes by sorted label value.
conf_matrix = confusion_matrix(df_test_idf['Category'], df_test_idf['Predicted_Category'])
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[118  55   8]
 [ 99 690  75]
 [ 84 175 149]]
