In [43]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the cleaned tweet dataset from the working directory.
em = pd.read_csv('Nikedataset_cleaned.csv')

# TF-IDF vectoriser: ignore terms found in more than 95% of documents or in
# fewer than 2 documents, and drop scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [64]:
# Tokens removed from 'normalized_text' before topic modelling, in addition to
# the vectoriser's English stop words.
excluded_words = [
    'bliblimegabrandfestival', 'ebay', 'ad', 'bids', 'united', 'phonewhatsapp', 'dm', 'gt', 'gtgtgt',
    'core', 'pls', 'th', 'golf',
    'poshmark', 'shopmycloset', 'closet', 'added', 'home', 'earps', 'low', 'atc', 'use', 'select',
    'options', 'polo', 'blue',
    'sickkicks', 'black', 'got', 'mary', 'want', 'product', 'school', 'goalkeeper', 'england',
    'ready', 'pt', 'theyre',
]

# Set membership is O(1) per token; the list above is kept for readability
# and for any other cell that reads `excluded_words`.
_excluded_lookup = set(excluded_words)

em['normalized_text'] = (
    em['normalized_text']
    # read_csv turns empty text fields into NaN (a float), which has no
    # .split(); treat those rows as empty documents instead of crashing.
    .fillna('')
    .apply(lambda text: ' '.join(
        token for token in text.split() if token.lower() not in _excluded_lookup
    ))
)

# Re-create the TF-IDF vectoriser for the filtered text.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Document-term matrix of the filtered text (one row per document).
dtm = tfidf.fit_transform(em['normalized_text'])

# Build and fit the NMF topic model in one step (fit() returns the estimator).
# 10 components -> topic ids 0..9; random_state is fixed for repeatable runs.
from sklearn.decomposition import NMF
nmf_model = NMF(n_components=10, random_state=1000).fit(dtm)

# Function to display topics
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    One line is printed per row of ``model.components_``, formatted as
    ``(index, (w1*" term1 " + w2*" term2 " + ...))`` with weights shown to
    three decimals and the strongest term first.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()`` and integer indexing.
        feature_names: Sequence mapping a column index to its term.
        num_top_words: Maximum number of terms to print per topic. Zero or a
            negative value prints no terms.
    """
    limit = max(num_top_words, 0)
    for index, topic in enumerate(model.components_):
        # Reverse first, then slice from the front: a limit of 0 then yields
        # nothing, whereas ``argsort()[-0:]`` would select every term.
        top_indices = topic.argsort()[::-1][:limit]
        terms = ' + '.join(f'{topic[i]:.3f}*" {feature_names[i]} "' for i in top_indices)
        print(f"({index}, ({terms}))")

# Print the 50 strongest terms of each fitted topic.
feature_names = tfidf.get_feature_names_out()
num_top_words = 50
display_topics(model=nmf_model, feature_names=feature_names, num_top_words=num_top_words)


(0, (4.033*" sizes " + 3.513*" size " + 2.257*" sku " + 2.106*" new " + 0.875*" available " + 0.778*" white " + 0.561*" air " + 0.532*" force " + 0.417*" nike " + 0.381*" retro " + 0.283*" jordan " + 0.261*" sail " + 0.247*" mid " + 0.206*" anthracite " + 0.187*" grey " + 0.178*" lx " + 0.167*" nkdv " + 0.161*" summit " + 0.148*" sp " + 0.146*" prm " + 0.142*" sb " + 0.135*" blazer " + 0.131*" gs " + 0.113*" ap " + 0.106*" premium " + 0.103*" lv " + 0.103*" red " + 0.101*" milk " + 0.101*" metallic " + 0.091*" nkdj " + 0.091*" navy " + 0.088*" nkdq " + 0.087*" nocta " + 0.087*" brown " + 0.087*" silver " + 0.086*" midnight " + 0.084*" coconut " + 0.080*" nkdd " + 0.079*" zoom " + 0.078*" nkfj " + 0.076*" gum " + 0.076*" nkdz " + 0.075*" nikeairmaxapcu " + 0.075*" foamposite " + 0.075*" slides " + 0.075*" pro " + 0.072*" green " + 0.071*" cb " + 0.070*" light " + 0.070*" uptempo "))
(1, (3.116*" check " + 2.811*" listing " + 1.128*" mens " + 0.994*" sleeve " + 0.960*" drifit " + 0.958*"

In [65]:
# Per-document topic weights from the fitted NMF model.
topic_results = nmf_model.transform(dtm)

# Preview: index of the strongest topic for each document.
topic_results.argmax(axis=1)

array([4, 4, 4, ..., 4, 4, 8], dtype=int64)

In [66]:
# Store each document's dominant topic id (column index of its largest weight).
em['Topic'] = topic_results.argmax(axis=1)

In [67]:
# Aspect label for each topic id; a label's position in the list is its topic id.
mytopic_dict = dict(enumerate([
    'Size Availability',       # topic 0
    'Size Availability',       # topic 1
    'Discount and Promotion',  # topic 2
    'Durability',              # topic 3
    'Durability',              # topic 4
    'Comfortability',          # topic 5
    'Comfortability',          # topic 6
    'Discount and Promotion',  # topic 7
    'Price',                   # topic 8
    'Price',                   # topic 9
]))

# Translate each topic id into its aspect label.
em['Topic Label'] = em['Topic'].map(mytopic_dict)

In [68]:
# Extract the top keywords for each topic
def get_top_keywords(model, feature_names, n_top_words):
    """Return the strongest terms of each topic as a comma-separated string.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term (terms
            must be strings so they can be joined).
        n_top_words: Maximum number of terms to keep per topic. Zero or a
            negative value yields an empty string for every topic.

    Returns:
        dict mapping the topic index (row position in ``components_``) to a
        ``', '``-joined string of its terms, strongest first.
    """
    limit = max(n_top_words, 0)
    # Reverse first, then slice from the front: a limit of 0 then yields
    # nothing, whereas ``argsort()[-0:]`` would select every term.
    return {
        idx: ', '.join(feature_names[i] for i in topic.argsort()[::-1][:limit])
        for idx, topic in enumerate(model.components_)
    }


In [69]:
# Summarise every topic by its ten strongest keywords.
top_keywords = get_top_keywords(nmf_model, tfidf.get_feature_names_out(), 10)

# Attach the keyword summary of each row's topic. Mapping with the dict
# directly mirrors how 'Topic Label' is derived from 'Topic'.
em['Top Ten Keywords'] = em['Topic'].map(top_keywords)

# Count how many documents fall under each aspect label.
aspect_counts = em['Topic Label'].value_counts()

# Display the aspect counts
print("Aspect Counts:")
print(aspect_counts)

Aspect Counts:
Topic Label
Comfortability            11534
Durability                10396
Size Availability          5090
Discount and Promotion     4123
Price                      2872
Name: count, dtype: int64


In [70]:
# Show full cell contents so the keyword strings are not truncated.
pd.set_option('display.max_colwidth', None)

# Topic columns only; show the last five rows.
df = pd.DataFrame(em.loc[:, ['Topic', 'Topic Label', 'Top Ten Keywords']])
df.tail()

Unnamed: 0,Topic,Topic Label,Top Ten Keywords
34010,4,Durability,"shirt, nike, football, away, wearing, adidas, sell, like, arsenal, make"
34011,1,Size Availability,"check, listing, mens, sleeve, drifit, size, fit, xl, short, large"
34012,4,Durability,"shirt, nike, football, away, wearing, adidas, sell, like, arsenal, make"
34013,4,Durability,"shirt, nike, football, away, wearing, adidas, sell, like, arsenal, make"
34014,8,Price,"sale, lower, size, buy, nike, sb, fleece, joggers, tech, dunk"


In [71]:
# NOTE(review): the tail() preview above ends at index 34014 and the aspect
# counts sum to 34,015 rows, so this positional slice starts past the end of
# the frame and displays only the header. Presumably a leftover from a larger
# dataset -- confirm the intended row range or remove the cell.
df.iloc[120910:120926]

Unnamed: 0,Topic,Topic Label,Top Ten Keywords


In [72]:
# Columns kept for export: tweet metadata, filtered text and the aspect label.
# Note that `df` is rebound here and no longer holds the topic/keyword preview.
df = pd.DataFrame(em.loc[:, ['created_at', 'username', 'normalized_text', 'Topic Label']])
df.tail(5)

Unnamed: 0,created_at,username,normalized_text,Topic Label
34010,Wed Jan 18 18:45:35 +0000 2023,TicaGordita,bs wear nike shirt soliciting country gone absolutely nuts nuts,Durability
34011,Wed Jan 18 18:44:38 +0000 2023,Rustedrack,check listing nike mens shirt drifit uv protection orange size xl,Size Availability
34012,Wed Jan 18 18:44:10 +0000 2023,cynthia_collyer,absolutely pathetic nike adidas shirt would thing hire lawyers go mall us,Durability
34013,Wed Jan 18 18:39:14 +0000 2023,BTSwithOT7,im going bit less week amp stay seoul looking bts txt related spots kpop shopping areas top tourist spots amp interesting hidden experiences ie making customized nike seoul shirt nike store please send travel tips,Durability
34014,Wed Jan 18 18:31:22 +0000 2023,SneakerCheaper,nike acg floral shirt light bone amp iron ore extra cart already reduced deal nike sale sneakerhead kotd,Price


In [73]:
# Write the labelled frame to a new CSV without the index column. The path is
# relative, so the file is created in the current working directory.
df.to_csv(path_or_buf='Nikedataset_aspects.csv', index=False)