# FILTERING SENTIMENT CATEGORIES

In [1]:
import pandas as pd

# Function to classify categories for each row based on keywords
def classify_category(row, category_keywords):
    category_priority = {
        "Footwear": 0,
        "Clothing": 1
    }

    # Initialize default aspect as 'Other'
    highest_priority_category = "Other"
    highest_priority = float('inf')

  # Check if normalized_text is not NaN
    if isinstance(row["normalized_text"], str):
        # Split the normalized_text into words
        words = row["normalized_text"].split()
        # Check each word in the row's 'normalized_text' field against the category_keywords
        for word in words:
            for category, keywords in category_keywords.items():
                if any(keyword in word for keyword in keywords):
                    if category_priority[category] < highest_priority:
                        highest_priority = category_priority[category]
                        highest_priority_category = category

    return highest_priority_category

# Load the CSV file
df = pd.read_csv('Nikedataset_aspects.csv')  

# Define the keywords for each category
category_keywords = {
    "Footwear": [
        "shoes", "shoe", "sneakers", "boots", "sandals", "high heels", "loafers", 
        "trainers", "slippers", "footwear", "running shoes", "hiking boots",
        "casual shoes", "formal shoes", "sports shoes", "kasut", "sneaker", "but",
        "selipar", "kasut tinggi", "loafer", "kasut sukan", "kasut lari", 
        "kasut mendaki", "kasut santai", "kasut rasmi","ultraboost","ultra boost",
        "superstar", "stan smith", "adizero", "predator", "nmd", "yeezy", "terrex", "alphabounce", "duramo",
        "air max", "air force", "vapormax", "cortez", "pegasus", "huarache", "mercurial", "sb dunk", "react", "zoomx",
        "suede", "rsx", "future rider", "king", "ultra", "ignite", "tazon", "fenty", "cali", "speedcat","proadapt","alphacat",
        "fusion","ignite","yeezys","dunk"
        
    ],
    "Clothing": [
        "shirt", "pants", "jacket", "dress", "coat", "skirt", "blouse", 
        "sweater", "jeans", "trousers", "top", "tee", "blazer", "hoodie",
        "underwear", "socks", "baju", "seluar", "jaket", "gaun", "kot", "skirt",
        "blaus", "sweater", "jeans", "seluar panjang", "atasan", "t-shirt", 
        "blazer", "hoodie", "pakaian dalam", "stokin",
        "tracksuit", "firebird", "adicolor", "tiro", "climacool", "condivo", "trefoil hoodie", "z.n.e.", "barricade",
        "pro combat", "dri-fit", "aeroswift", "thermafit", "flyknit", "windrunner", "tech fleece", "pro hypercool", "sphere", "hyperwarm",
        "essentials", "amplified", "last lap", "iconic t7", "nightcat", "evostripe", "active gaming", "performance tee", "liga", "archive"
    ]
}

# Apply the function to each row and update the 'Category' column
df['Category'] = df.apply(lambda row: classify_category(row, category_keywords), axis=1)

# Display the first few rows of the DataFrame with the updated "Category" column
print(df[['normalized_text', 'Category']].head())

# Count the occurrences of each category in the 'Category' column
category_counts = df['Category'].value_counts()

# Display the count of each category
print(category_counts)


                                     normalized_text  Category
0  nike trash adidas go favoriteexceptfor nike wr...  Clothing
1  team new new look season take advantage adidas...     Other
2  favorite breakfast shoes adidas derrick rose m...  Footwear
3  deal ladies youre suns fan adidas nmd r impact...  Footwear
4  select sizes available creamcarbongrey adidas ...  Clothing
Category
Other       22622
Clothing    18496
Footwear    18236
Name: count, dtype: int64


In [2]:
pd.set_option('display.max_colwidth', None)
df= pd.DataFrame(df[['created_at','username','normalized_text','Topic Label','Category']])
df.head()

Unnamed: 0,created_at,username,normalized_text,Topic Label,Category
0,Fri Sep 29 21:25:46 +0000 2023,alpha1906,nike trash adidas go favoriteexceptfor nike wrestling joints copped pandemic damn things comfortable hell,Comfortability,Clothing
1,Fri Sep 29 17:05:47 +0000 2023,ElkCitySptswear,team new new look season take advantage adidas limited time pinch hitter promotion contact us details today,Comfortability,Other
2,Fri Sep 29 14:18:11 +0000 2023,BeardedMangus,favorite breakfast shoes adidas derrick rose much comfortable court nike ive owned,Comfortability,Footwear
3,Fri Sep 29 07:33:35 +0000 2023,TheJoelParadox,deal ladies youre suns fan adidas nmd r impact orange nmd r comfortable especially uses hopefully helpful someone,Comfortability,Footwear
4,Fri Sep 29 04:15:11 +0000 2023,KicksDeals,select sizes available creamcarbongrey adidas harden vol free shipping buy promotion use code hoodieszn checkout,Size Availability,Clothing


In [7]:
df.iloc[13:18]

Unnamed: 0,created_at,username,normalized_text,Topic Label,Category
13,Tue Feb 27 15:15:29 +0000 2024,nuhuhuhhhh,bit misconception shirt sales players get anything clubs get tiny amount rest goes etc surprised mbappe anomaly case tho lol,Price,Clothing
14,Tue Feb 27 14:55:55 +0000 2024,lewisgiw,one worst chelsea shirt generic template iridescent badges make properly,Price,Clothing
15,Tue Feb 27 14:08:58 +0000 2024,rockstardjc,better value branded score draw shirt,Price,Clothing
16,Tue Feb 27 12:13:48 +0000 2024,Iroh_West,shirt bought miami missing believe one rurals took women good,Price,Clothing
17,Tue Feb 27 11:28:58 +0000 2024,SportsDIYguy,almost synonym boring stripped someone asks clean first thought like shirt name futura bold small team drives crazy,Price,Clothing


In [24]:
# Save the updated DataFrame to a new CSV file
#df.to_csv('Nikedataset_aspects_category.csv', index=False)  # This will save the file in the same directory
