## Importing Libraries

In [46]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
import gensim.downloader as api

## EDA

In [2]:
df = pd.read_csv('./dataset/train.csv')

In [3]:
df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [4]:
df.describe()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH
count,2249698.0,2249698.0,2249698.0
mean,1499795.0,4000.456,4071.839
std,866194.4,3966.146,1351685.0
min,1.0,0.0,1.0
25%,749479.5,230.0,511.811
50%,1499558.0,2916.0,663.0
75%,2250664.0,6403.0,1062.992
max,2999999.0,13420.0,1885801000.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249698 entries, 0 to 2249697
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PRODUCT_ID       int64  
 1   TITLE            object 
 2   BULLET_POINTS    object 
 3   DESCRIPTION      object 
 4   PRODUCT_TYPE_ID  int64  
 5   PRODUCT_LENGTH   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 103.0+ MB


In [6]:
df['PRODUCT_TYPE_ID'].nunique()

12907

In [11]:
df.isna().sum()

PRODUCT_ID               0
TITLE                   12
BULLET_POINTS       837364
DESCRIPTION        1157381
PRODUCT_TYPE_ID          0
PRODUCT_LENGTH           0
dtype: int64

In [13]:
df.fillna('', inplace=True)

In [14]:
df.isna().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64

In [15]:
df['TOTAL_PRODUCT_INFORMATION'] = df['TITLE'] + ' ' + df['BULLET_POINTS'] + ' ' + df['DESCRIPTION']

df.drop(columns=['TITLE', 'BULLET_POINTS', 'DESCRIPTION'], inplace=True)

In [16]:
df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TOTAL_PRODUCT_INFORMATION
0,1925202,1650,2125.98,ArtzFolio Tulip Flowers Blackout Curtain for D...
1,2673191,2755,393.7,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...
2,2765088,7537,748.031495,PRIKNIK Horn Red Electric Air Horn Compressor ...
3,1594019,2996,787.401574,ALISHAH Women's Cotton Ankle Length Leggings C...
4,283658,6112,598.424,The United Empire Loyalists: A Chronicle of th...


## Text Pre-Processing

### Lower Casing

In [17]:
df["TOTAL_PRODUCT_INFORMATION"] = df["TOTAL_PRODUCT_INFORMATION"].str.lower()

In [18]:
df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TOTAL_PRODUCT_INFORMATION
0,1925202,1650,2125.98,artzfolio tulip flowers blackout curtain for d...
1,2673191,2755,393.7,marks & spencer girls' pyjama sets t86_2561c_n...
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah women's cotton ankle length leggings c...
4,283658,6112,598.424,the united empire loyalists: a chronicle of th...


### Removing Punctuations

In [19]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [21]:
df['TOTAL_PRODUCT_INFORMATION'] = df['TOTAL_PRODUCT_INFORMATION'].str.replace(f"[{punctuations}]", "")

  df['TOTAL_PRODUCT_INFORMATION'] = df['TOTAL_PRODUCT_INFORMATION'].str.replace(f"[{punctuations}]", "")


In [22]:
df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TOTAL_PRODUCT_INFORMATION
0,1925202,1650,2125.98,artzfolio tulip flowers blackout curtain for d...
1,2673191,2755,393.7,marks spencer girls pyjama sets t862561cnavy ...
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah womens cotton ankle length leggings co...
4,283658,6112,598.424,the united empire loyalists a chronicle of the...


### Removing Stop Words

In [29]:
from nltk.corpus import stopwords

eng_stop_words = stopwords.words('english')
print(eng_stop_words)
print(type(eng_stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [34]:
def remove_stopwords(text):
    """
    Removes Stop Words from the text.
    """
    word_tokens = nltk.word_tokenize(text)
    words_without_stopwords = [word for word in word_tokens if word not in eng_stop_words]
    text_without_stopwords = " ".join(words_without_stopwords)
    return text_without_stopwords

df["TOTAL_PRODUCT_INFORMATION"] = df["TOTAL_PRODUCT_INFORMATION"].apply(remove_stopwords)

In [35]:
df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TOTAL_PRODUCT_INFORMATION
0,1925202,1650,2125.98,artzfolio tulip flowers blackout curtain door ...
1,2673191,2755,393.7,marks spencer girls pyjama sets t862561cnavy m...
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah womens cotton ankle length leggings co...
4,283658,6112,598.424,united empire loyalists chronicle great migration


### Lemmatization

In [39]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df["TOTAL_PRODUCT_INFORMATION"] = df["TOTAL_PRODUCT_INFORMATION"].apply(lemmatize_words)

In [40]:
df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TOTAL_PRODUCT_INFORMATION
0,1925202,1650,2125.98,artzfolio tulip flower blackout curtain door w...
1,2673191,2755,393.7,mark spencer girl pyjama set t862561cnavy mix9...
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah woman cotton ankle length legging comb...
4,283658,6112,598.424,united empire loyalist chronicle great migration


### Removing URLs

In [41]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df["TOTAL_PRODUCT_INFORMATION"] = df["TOTAL_PRODUCT_INFORMATION"].apply(remove_urls)

In [42]:
df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TOTAL_PRODUCT_INFORMATION
0,1925202,1650,2125.98,artzfolio tulip flower blackout curtain door w...
1,2673191,2755,393.7,mark spencer girl pyjama set t862561cnavy mix9...
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah woman cotton ankle length legging comb...
4,283658,6112,598.424,united empire loyalist chronicle great migration


### Removing HTML Tags

In [43]:
def remove_html_tags(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

df["TOTAL_PRODUCT_INFORMATION"] = df["TOTAL_PRODUCT_INFORMATION"].apply(remove_html_tags)

In [44]:
df.head(30)

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TOTAL_PRODUCT_INFORMATION
0,1925202,1650,2125.98,artzfolio tulip flower blackout curtain door w...
1,2673191,2755,393.7,mark spencer girl pyjama set t862561cnavy mix9...
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah woman cotton ankle length legging comb...
4,283658,6112,598.424,united empire loyalist chronicle great migration
5,2152929,5725,950.0,hin metal bucket shape plant pot indoor outdoo...
6,413758,23,598.0,ungifted life journey
7,2026580,6030,984.251967,delavala self adhesive kitchen backsplash wall...
8,2050239,3302,393.7,puma cali sport clean woman sneaker white leat...
9,2998633,8201,393.700787,hexwell essential oil home fragrance oil aroma...


## Creating Word Embeddings

In [47]:
# Load the pre-trained GloVe embeddings
model_name = 'glove-wiki-gigaword-300'
model = api.load(model_name)



IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





In [52]:
# Tokenize the text and generate the embeddings
def embeddings(text):
    tokens = text.split()
    embeddings = []
    for token in tokens:
        if model.has_index_for(token):
            embeddings.append(model.get_vector(token))
        else:
            embeddings.append(np.zeros((300,), dtype=float))
    embeddings = np.array(embeddings)
    return embeddings

In [53]:
df['EMBEDDINGS'] = df['TOTAL_PRODUCT_INFORMATION'].apply(embeddings)

MemoryError: Unable to allocate 570. KiB for an array with shape (243, 300) and data type float64

In [None]:
test_df = pd.read_csv('./dataset/test.csv')

In [None]:
from xgboost import XGBRegressor

regressor = XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.01, eval_metric="mae")
regressor.fit(df['PRODUCT_TYPE_ID'], df["PRODUCT_LENGTH"])

In [None]:
predictions = regressor.predict(test_df['PRODUCT_TYPE_ID'])

In [None]:
predictions.shape

In [None]:
predictions

In [None]:
pred_df = pd.DataFrame({'PRODUCT_ID': test_df['PRODUCT_ID'], 'PRODUCT_LENGTH': predictions})

In [None]:
pred_df

In [None]:
pred_df.to_csv('./vague_prediction.csv', index=False)

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import mutual_info_score

print(mutual_info_score(df['PRODUCT_TYPE_ID'], df['PRODUCT_LENGTH']))

In [None]:
df['PRODUCT_TYPE_ID'].corr(df['PRODUCT_LENGTH'])