In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the cleaned Adidas dataset. Later cells read its 'normalized_text',
# 'created_at' and 'username' columns.
# The TF-IDF vectorizer is intentionally not built here: the next cell
# constructs it with the same settings right before fitting, so an instance
# created in this cell would be overwritten without ever being used.
em = pd.read_csv('Adidasdataset_cleaned.csv')

In [2]:
from sklearn.decomposition import NMF

# Tokens to strip from the 'normalized_text' column before vectorizing.
excluded_words = ['bliblimegabrandfestival', 'ebay', 'ad', 'bids', 'united', 'phonewhatsapp', 'dm', 'gt', 'gtgtgt', 'core', 'pls', 'th']
# Hash-based lookup so every token is checked in constant time instead of
# scanning the list once per word.
excluded_lookup = frozenset(excluded_words)


def _drop_excluded(text):
    """Return `text` re-joined with single spaces, minus excluded tokens (case-insensitive match)."""
    return ' '.join(word for word in text.split() if word.lower() not in excluded_lookup)


# read_csv turns empty text cells into NaN (a float), which has no .split();
# coerce missing values to '' so the filter and the vectorizer only see strings.
em['normalized_text'] = em['normalized_text'].fillna('').astype(str).apply(_drop_excluded)

# Define TF-IDF vectorizer: ignore terms present in more than 95% of the
# documents or in fewer than 2 documents, and drop English stop words.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Document-term matrix built from the filtered text
dtm = tfidf.fit_transform(em['normalized_text'])

# Factorize into 5 topics; the fixed random_state keeps the run reproducible.
nmf_model = NMF(n_components=5, random_state=1000)
nmf_model.fit(dtm)

# Function to display topics
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in `model`.

    One line is printed per row of ``model.components_`` in the form
    ``(index, (w1*" term1 " + w2*" term2 " + ...))`` with terms ordered from
    the largest weight to the smallest and weights shown with three decimals.

    Parameters
    ----------
    model : object exposing ``components_`` (iterable of 1-D weight arrays)
    feature_names : sequence mapping a column index to its term
    num_top_words : int
        Number of terms to show per topic. Zero or a negative value shows no
        terms (an empty pair of parentheses) rather than the full vocabulary.
    """
    # A tail slice `[-n:]` degenerates to the whole array when n == 0, so rank
    # in descending order first and take a clamped head slice instead.
    limit = max(num_top_words, 0)
    for index, topic in enumerate(model.components_):
        ranked = topic.argsort()[::-1][:limit]
        terms = ' + '.join(f'{topic[i]:.3f}*" {feature_names[i]} "' for i in ranked)
        print(f"({index}, ({terms}))")

# Display the topics
# Vocabulary learned by the fitted vectorizer; column i of the topic matrix
# corresponds to feature_names[i].
feature_names = tfidf.get_feature_names_out()
num_top_words = 10
display_topics(model=nmf_model, feature_names=feature_names, num_top_words=num_top_words)


(0, (2.884*" available " + 1.530*" adidas " + 1.450*" sneakerscouts " + 0.938*" sizes " + 0.901*" retail " + 0.871*" samba " + 0.839*" ultraboost " + 0.762*" og " + 0.752*" white " + 0.613*" free "))
(1, (2.385*" flash " + 2.130*" sale " + 1.839*" size " + 0.603*" adidas " + 0.080*" check " + 0.052*" price " + 0.052*" yeezy " + 0.041*" uk " + 0.040*" buy " + 0.037*" mens "))
(2, (2.118*" shirt " + 0.761*" ends " + 0.742*" currently " + 0.720*" watchers " + 0.714*" football " + 0.692*" pm " + 0.658*" adidas " + 0.612*" home " + 0.568*" away " + 0.408*" nufc "))
(3, (1.615*" shoes " + 1.483*" best " + 1.238*" adidas " + 0.579*" nike " + 0.547*" comfortable " + 0.417*" comfy " + 0.387*" like " + 0.370*" running " + 0.288*" look " + 0.256*" new "))
(4, (1.533*" discount " + 1.228*" promo " + 1.223*" send " + 1.140*" delivery " + 1.124*" telegram " + 1.010*" nationwide " + 0.544*" trainers " + 0.368*" adidas " + 0.273*" slider " + 0.259*" code "))


In [3]:
# Topic weights for every document: one row per document, one column per
# NMF component.
topic_results = nmf_model.transform(dtm)

# Preview the index of the strongest topic for each document.
topic_results.argmax(axis=1)

array([3, 3, 3, ..., 3, 3, 3], dtype=int64)

In [4]:
# Store each document's dominant topic, i.e. the column holding its largest
# weight. argmax returns the first maximum, so a row whose weights are all
# equal (for example all zero) is assigned topic 0.
em['Topic'] = topic_results.argmax(axis=1)

In [5]:
# Aspect name for each NMF topic index; list position == topic index.
mytopic_dict = dict(enumerate([
    'Size Availability',
    'Price',
    'Durability',
    'Comfortability',
    'Discount and Promotion',
]))

# Translate the numeric topic into its aspect label.
em['Topic Label'] = em['Topic'].map(mytopic_dict)

In [6]:
# Extract top ten keywords for each topic
def get_top_keywords(model, feature_names, n_top_words):
    """Map each topic index to a comma-separated string of its top terms.

    Parameters
    ----------
    model : object exposing ``components_`` (iterable of 1-D weight arrays)
    feature_names : sequence mapping a column index to its term
    n_top_words : int
        Number of terms to keep per topic. Zero or a negative value yields an
        empty string for every topic rather than the full vocabulary.

    Returns
    -------
    dict
        ``{topic_index: 'term1, term2, ...'}`` with terms ordered from the
        largest weight to the smallest.
    """
    # A tail slice `[-n:]` degenerates to the whole array when n == 0, so rank
    # in descending order first and take a clamped head slice instead.
    limit = max(n_top_words, 0)
    top_keywords = {}
    for idx, topic in enumerate(model.components_):
        ranked = topic.argsort()[::-1][:limit]
        top_keywords[idx] = ', '.join(feature_names[i] for i in ranked)
    return top_keywords


In [7]:
# Keyword summary per topic and aspect frequencies
# Build the {topic index: "kw1, kw2, ..."} lookup for the ten strongest terms.
top_keywords = get_top_keywords(nmf_model, tfidf.get_feature_names_out(), 10)

# Give every row the keyword string of its assigned topic.
em['Top Ten Keywords'] = em['Topic'].map(top_keywords)

# Number of documents per aspect label, most frequent first.
aspect_counts = em['Topic Label'].value_counts()

print("Aspect Counts:", aspect_counts, sep='\n')

Aspect Counts:
Topic Label
Comfortability            8556
Durability                4420
Size Availability         4150
Discount and Promotion    1372
Price                     1311
Name: count, dtype: int64


In [8]:
# Do not truncate long cell values, so the full keyword string is visible.
pd.set_option('display.max_colwidth', None)

# Topic assignment columns only; show the last five rows.
df = em.loc[:, ['Topic', 'Topic Label', 'Top Ten Keywords']]
df.tail()

Unnamed: 0,Topic,Topic Label,Top Ten Keywords
19804,3,Comfortability,"shoes, best, adidas, nike, comfortable, comfy, like, running, look, new"
19805,3,Comfortability,"shoes, best, adidas, nike, comfortable, comfy, like, running, look, new"
19806,3,Comfortability,"shoes, best, adidas, nike, comfortable, comfy, like, running, look, new"
19807,3,Comfortability,"shoes, best, adidas, nike, comfortable, comfy, like, running, look, new"
19808,3,Comfortability,"shoes, best, adidas, nike, comfortable, comfy, like, running, look, new"


In [9]:
# NOTE(review): the tail() output above ends at index 19808, so positions
# 120910-120925 do not exist and this slice displays an empty frame.
# Confirm the intended row range (or remove the cell).
df.iloc[120910:120926]

Unnamed: 0,Topic,Topic Label,Top Ten Keywords


In [10]:
# Columns kept for the exported result: timestamp, author, filtered text and
# the assigned aspect label.
export_columns = ['created_at', 'username', 'normalized_text', 'Topic Label']
df = em.loc[:, export_columns]
df.head()

Unnamed: 0,created_at,username,normalized_text,Topic Label
0,Fri Sep 29 21:25:46 +0000 2023,alpha1906,nike trash adidas go favoriteexceptfor nike wrestling joints copped pandemic damn things comfortable hell,Comfortability
1,Fri Sep 29 17:05:47 +0000 2023,ElkCitySptswear,team new new look season take advantage adidas limited time pinch hitter promotion contact us details today,Comfortability
2,Fri Sep 29 14:18:11 +0000 2023,BeardedMangus,favorite breakfast shoes adidas derrick rose much comfortable court nike ive owned,Comfortability
3,Fri Sep 29 07:33:35 +0000 2023,TheJoelParadox,deal ladies youre suns fan adidas nmd r impact orange nmd r comfortable especially uses hopefully helpful someone,Comfortability
4,Fri Sep 29 04:15:11 +0000 2023,KicksDeals,select sizes available creamcarbongrey adidas harden vol free shipping buy promotion use code hoodieszn checkout,Size Availability


In [11]:
# Write the labelled frame to CSV without the index column. The relative path
# is resolved against the current working directory.
df.to_csv(path_or_buf='Adidasdataset_aspects.csv', index=False)