In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:
train_data.head()

In [4]:
train_data.shape
# we have over 2 million rows

In [5]:
train_data.info()

In [6]:
# Create real backups of the raw frames.
# `.copy()` is required: a plain assignment only binds a second name to the
# same DataFrame, so later in-place column writes (e.g. on test_data) would
# silently change the "backup" as well. Note this doubles memory use.
backup = train_data.copy()
backup_test = test_data.copy()

In [7]:
#FIRST TASK IS TO CLEAN THE DESCRIPTION AND BULLET POINTS COLUMNS AND JOIN THEM

#TITLE
train_data[train_data["TITLE"].isna() == True ] 

In [8]:
#Drop these
train_data = train_data.dropna(subset=["TITLE"])


In [9]:
#check for NULL in description and bullet points both

# Boolean masks of rows whose DESCRIPTION / BULLET_POINTS value is null
duplicate_bullet = train_data["BULLET_POINTS"].isnull()
duplicate_desc = train_data["DESCRIPTION"].isnull()
# True only where both text fields are missing
both_duplicate = duplicate_bullet & duplicate_desc
train_data.loc[both_duplicate]

In [10]:
#dropping these
train_data = train_data.dropna(subset=['DESCRIPTION', 'BULLET_POINTS'], how='all')

In [11]:
train_data.shape

In [12]:
train_data

In [13]:
# fill the remaining missing values with empty strings

train_data["BULLET_POINTS"] = train_data["BULLET_POINTS"].fillna("")
train_data["DESCRIPTION"] = train_data["DESCRIPTION"].fillna("")

In [14]:
train_data

In [15]:
train_data.describe()

In [16]:
#the max length product is an outlier 
train_data = train_data[train_data["PRODUCT_LENGTH"] < 10e+4]

In [17]:
train_data.shape

In [18]:
train_data = train_data[train_data["PRODUCT_LENGTH"] > 10]

In [19]:
train_data.describe()

In [20]:
# Keep only the first 40 characters of each title. This runs before the HTML
# stripping further down, so the 40 characters are counted on the raw title.
# NOTE(review): the test-set cleaning cell never truncates TITLE - confirm
# whether this train/test difference is intended.
train_data['TITLE'] = train_data['TITLE'].str[:40]

In [21]:
#now we will be cleaning the textual data like in title, description and bullet points

#Escaping out HTML characters
import re

# Non-greedy pattern for one tag-like span: "<", any non-newline chars, ">"
CLEANR = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every span matched by ``CLEANR`` deleted."""
    return CLEANR.sub('', raw_html)

# Strip tag-like spans from every free-text column
for column in ("DESCRIPTION", "BULLET_POINTS", "TITLE"):
    train_data[column] = train_data[column].apply(cleanhtml)

In [22]:
#Encode from UTF-8 to ascii
def encode_decode(html):
    """Return ``html`` with every non-ASCII character removed.

    The text is encoded to ASCII with unencodable characters ignored, and the
    resulting bytes are decoded back into a ``str``.
    """
    ascii_bytes = html.encode('ascii', 'ignore')
    return ascii_bytes.decode(encoding='UTF-8')

# Drop non-ASCII characters from every free-text column
for column in ("BULLET_POINTS", "DESCRIPTION", "TITLE"):
    train_data[column] = train_data[column].apply(encode_decode)

In [23]:
#library for regular expressions
import re


def regexx(tweet):
    """Remove hyperlinks, '#' characters and a leading "RT" marker from ``tweet``.

    The substitutions run one after another in the order listed below.
    """
    substitutions = (
        (r'https?:\/\/.\S+', ""),   # hyperlinks
        (r'#', ''),                 # only the hash sign; the tagged word stays
        (r'^RT[\s]+', ''),          # old-style retweet prefix at the very start
    )
    for pattern, replacement in substitutions:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet

# Remove links, hash signs and a leading "RT" from every free-text column
for column in ("DESCRIPTION", "BULLET_POINTS", "TITLE"):
    train_data[column] = train_data[column].apply(regexx)


In [24]:
#dictionary consisting of the contraction and the actual value
def apos(tweet):
    """Expand apostrophe contractions in ``tweet`` and return the result.

    Each suffix in the mapping is replaced by its expansion; replacements are
    applied sequentially in the mapping's insertion order.
    """
    expansions = {"'s": " is", "n't": " not", "'m": " am", "'ll": " will",
                  "'d": " would", "'ve": " have", "'re": " are"}

    for contraction, expanded in expansions.items():
        # str.replace leaves the text unchanged when the contraction is absent
        tweet = tweet.replace(contraction, expanded)

    return tweet

# Expand contractions in every free-text column
for column in ("DESCRIPTION", "BULLET_POINTS", "TITLE"):
    train_data[column] = train_data[column].apply(apos)


In [25]:
import re


def split(tweet):
    """Separate capitalised-word runs in ``tweet`` with spaces.

    The text is split on runs that start with an uppercase letter followed by
    at least one lowercase letter (and any following non-uppercase characters);
    the non-empty pieces are then joined with single spaces.
    """
    pieces = re.split("([A-Z][a-z]+[^A-Z]*)", tweet)
    return " ".join(filter(None, pieces))

# Separate fused capitalised words in every free-text column
for column in ("DESCRIPTION", "BULLET_POINTS", "TITLE"):
    train_data[column] = train_data[column].apply(split)


In [26]:
#convert to lower case
def lower(tweet):
    """Return ``tweet`` converted to lower case."""
    return tweet.lower()

# Lower-case every free-text column
for column in ("DESCRIPTION", "BULLET_POINTS", "TITLE"):
    train_data[column] = train_data[column].apply(lower)


In [27]:
import nltk

# Cache for the English stop-word set; filled on the first stopw() call.
_STOPWORDS_ENG = None


def _english_stopwords():
    """Return the English stop-word set, loading (and if needed downloading) it once."""
    global _STOPWORDS_ENG
    if _STOPWORDS_ENG is None:
        from nltk.corpus import stopwords
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed locally yet: fetch it, then retry.
            nltk.download('stopwords')
            words = stopwords.words('english')
        # A set gives constant-time membership tests (the list was scanned per word).
        _STOPWORDS_ENG = frozenset(words)
    return _STOPWORDS_ENG


def stopw(tweet):
    """Split ``tweet`` on whitespace and drop English stop words.

    The stop-word corpus is loaded once and cached, instead of being
    downloaded and re-read on every call (which, under ``Series.apply``,
    meant once per row).

    Parameters
    ----------
    tweet : str
        Text to tokenise.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``tweet`` that are not stop words,
        in their original order.
    """
    stopwords_eng = _english_stopwords()
    return [word for word in tweet.split() if word not in stopwords_eng]

In [28]:
# train_data["DESCRIPTION"] = train_data["DESCRIPTION"].apply(stopw)
#pip install nltk

In [29]:
# train_data["BULLET_POINTS"] = train_data["BULLET_POINTS"].apply(stopw)

In [30]:
#for string operations
import string


def removepunc(tweet_list):
    """Return the tokens of ``tweet_list`` that are not pure punctuation.

    A token is dropped when every one of its characters is in
    ``string.punctuation`` (the empty string is dropped too). The previous
    ``word not in string.punctuation`` check was a substring test against the
    punctuation string, so a token such as "..." was kept while "()" was
    removed only because those two characters happen to be adjacent there.

    Parameters
    ----------
    tweet_list : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        Tokens containing at least one non-punctuation character, in order.
    """
    punctuation = set(string.punctuation)
    return [word for word in tweet_list
            if any(ch not in punctuation for ch in word)]

In [31]:
# train_data["BULLET_POINTS"] = train_data["BULLET_POINTS"].apply(removepunc)
# train_data["DESCRIPTION"] = train_data["DESCRIPTION"].apply(removepunc)
# train_data["TITLE"] = train_data["TITLE"].apply(removepunc)

In [32]:
# Make a new column called "text" from the three cleaned fields.
# The fields are joined with a single space: plain `+` concatenation would
# fuse the last word of one field with the first word of the next and
# produce tokens that exist in neither field.
train_data["text"] = (
    train_data["TITLE"] + " " + train_data["BULLET_POINTS"] + " " + train_data["DESCRIPTION"]
)

In [33]:
def remove_duplicates(x):
    """Return ``x`` with repeated whitespace-separated words removed.

    The first occurrence of each word is kept and the original word order is
    preserved. (Deduplicating through ``set`` returned the same words but in
    an arbitrary order that can change from one interpreter run to the next,
    which made the cleaned text non-reproducible.)

    Parameters
    ----------
    x : str
        Text to deduplicate.

    Returns
    -------
    str
        The unique words of ``x`` joined by single spaces.
    """
    # dict keys are unique and keep insertion order
    return ' '.join(dict.fromkeys(x.split()))
train_data['text'] = train_data['text'].apply(remove_duplicates)

In [34]:
def remove_symbols(x):
    """Delete every character of ``x`` that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', x)
train_data['text'] = train_data['text'].apply(remove_symbols)

In [35]:
# Show the first cleaned text. Use positional access: rows were filtered
# above and the index has not been reset yet, so the label 0 may no longer
# exist and `["text"][0]` could raise KeyError.
train_data["text"].iloc[0]

In [36]:
import nltk
nltk.download('stopwords')

In [37]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
#stop words
stop_words = set(stopwords.words('english'))
def remove_stop_words(x):
    """Return ``x`` without the words whose lower-cased form is in ``stop_words``."""
    kept = [word for word in x.split() if word.lower() not in stop_words]
    return ' '.join(kept)

In [38]:
train_data['text'] = train_data['text'].apply(remove_stop_words)

In [39]:
train_data = train_data.drop(['TITLE', 'DESCRIPTION', 'BULLET_POINTS'], axis=1)

In [40]:
train_data = train_data.reset_index(drop = True)

In [41]:
train_data

In [42]:
#train_data.to_csv("cleaned_train.csv",index=False)

In [43]:
# Snapshot the current frames. `.copy()` is needed because test_data is
# modified in place by the cleaning cell that follows; without it
# `backup_test` would just be another name for the frame being changed.
backup = train_data.copy()
backup_test = test_data.copy()

In [44]:
# Time to clean the test data (we can't alter the set of rows, i.e. no test row may be deleted)
test_data

In [45]:
test_data[test_data["TITLE"].isna()== True]

In [46]:
sample = pd.read_csv("sample_submission.csv")
sample

In [47]:
test_data[test_data["TITLE"].isna() == True]

In [48]:
# Clean the three free-text columns with the same steps, in the same order,
# as the training data. Missing values become "" first because no test row
# may be dropped.
# NOTE(review): the training TITLE was cut to 40 characters before cleaning;
# the test TITLE is left at full length here - confirm which is intended.
cleaning_steps = (cleanhtml, encode_decode, regexx, apos, split, lower)
for column in ("DESCRIPTION", "BULLET_POINTS", "TITLE"):
    cleaned = test_data[column].fillna("")
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    test_data[column] = cleaned

# Join the fields with single spaces: plain `+` concatenation would fuse the
# last word of one field with the first word of the next.
test_data["text"] = (
    test_data["TITLE"] + " " + test_data["BULLET_POINTS"] + " " + test_data["DESCRIPTION"]
)

# Same post-processing order as for the training text
for text_step in (remove_duplicates, remove_symbols, remove_stop_words):
    test_data["text"] = test_data["text"].apply(text_step)

In [49]:
test_data = test_data.drop(['TITLE', 'DESCRIPTION', 'BULLET_POINTS'], axis=1)
test_data

In [50]:
#test_data.to_csv("cleaned_test.csv",index=False)

In [51]:
sample = pd.read_csv("sample_submission.csv")
sample

In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [53]:
X = train_data.drop(columns=["PRODUCT_ID","PRODUCT_LENGTH"])
y = train_data["PRODUCT_LENGTH"]
X_test = test_data[["PRODUCT_TYPE_ID","text"]]

In [54]:
y

In [55]:
X_test

In [56]:
y_test = sample["PRODUCT_LENGTH"]

In [57]:
X

In [58]:
# Downcast the target to half precision, then zero any infinite entries.
# NOTE(review): float16 cannot represent values above 65504 (they become
# inf), while the row filter earlier keeps PRODUCT_LENGTH values up to 1e5.
# Such targets are therefore turned into inf here and overwritten with 0 on
# the next line. float16 also keeps only about 3 significant decimal digits.
# Consider float32 - and change the later `y.astype("float16")` call with it.
y = y.astype('float16')
y[np.isinf(y)] = 0
y.max()

In [59]:
y_test = y_test.astype("float16")
y_test

In [60]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
# from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error

In [61]:
X

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Select columns by name rather than by position. Positional index 1 of X is
# presumably the free-text column (X is built like test_data[["PRODUCT_TYPE_ID",
# "text"]]), so the old setup one-hot encoded every distinct text string and
# passed PRODUCT_TYPE_ID through as a plain number.
#  - PRODUCT_TYPE_ID is categorical -> one-hot; categories unseen during fit
#    are encoded as all zeros instead of raising at predict time.
#  - text -> TF-IDF. The column is given as a scalar name because the
#    vectorizer expects a 1-D sequence of strings.
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(handle_unknown='ignore'), ['PRODUCT_TYPE_ID']),
    ('text_tfidf', TfidfVectorizer(), 'text'),
], remainder='passthrough')

step2 = LinearRegression()

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

# Fit on the target taken straight from train_data (the frame X was derived
# from, so rows are aligned) as float32. The float16 `y` built earlier turns
# every length above 65504 into inf and then 0, and loses precision.
pipe.fit(X=X, y=train_data["PRODUCT_LENGTH"].astype("float32"))

# y_pred = pipe.predict(X_test)

In [63]:
#y_pred = pipe.predict(X_test)

In [64]:
#y_pred.to_csv("predicted_submission.csv",index=False)

In [65]:
from pathlib import Path

# The export cells above are commented out, so on a fresh run the cleaned CSV
# files may not exist. Write the in-memory cleaned frames once when a file is
# missing, then always reload from disk so later cells see the same
# CSV-round-tripped data (e.g. empty text read back as NaN).
# This assumes the cleaning cells above have been run in this kernel.
if not Path("cleaned_train.csv").exists():
    train_data.to_csv("cleaned_train.csv", index=False)
if not Path("cleaned_test.csv").exists():
    test_data.to_csv("cleaned_test.csv", index=False)

train_data = pd.read_csv("cleaned_train.csv")
test_data = pd.read_csv("cleaned_test.csv")

In [66]:
train_data

In [67]:
import seaborn as sns
import matplotlib.pyplot as plt

In [68]:
# Reduce the training set size
import pandas as pd
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set.
# Note: this rebinds train_data, so re-running the cell splits the already
# reduced frame again.
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Randomly sample the training split. The size is capped at the number of
# available rows because DataFrame.sample raises ValueError when n exceeds
# the frame length (sampling without replacement).
sample_size = min(100000, len(train_data))
train_data_sample = train_data.sample(n=sample_size, random_state=42)

# Save the sampled training data to a CSV file
train_data_sample.to_csv('sample.csv', index=False)


In [69]:
train_data_reduced = pd.read_csv("sample.csv")

In [70]:
train_data_reduced.describe()

In [71]:
sns.histplot(train_data_reduced['PRODUCT_LENGTH'])
plt.show()

In [None]:
# Turn the text column into TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# Training text, with null entries replaced by empty strings
text_column = train_data_reduced["text"]
text_column = np.where(text_column.isnull(), '', text_column)

# Learn the vocabulary and IDF weights on the training text and encode it
vectorizer = TfidfVectorizer()
text_column_numerical = vectorizer.fit_transform(text_column)


# Same for the test data: encode with the vocabulary fitted on training text
text_column_test = test_data["text"]
text_column_test = np.where(text_column_test.isnull(), '', text_column_test)
text_column_numerical_test = vectorizer.transform(text_column_test)

In [None]:
text_column_numerical, text_column_numerical_test

In [None]:
test_data

In [None]:
X.shape

In [None]:
X_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack, csc_matrix
import sklearn.metrics as metrics
# Inputs taken from earlier cells:
# text_column_numerical / text_column_numerical_test = TF-IDF matrices of the text column
# product_type_id_column = PRODUCT_TYPE_ID of the sampled training rows
# product_length_column = PRODUCT_LENGTH of the sampled training rows (the target)
product_type_id_column = train_data_reduced["PRODUCT_TYPE_ID"]
product_length_column = train_data_reduced["PRODUCT_LENGTH"]

product_type_id_column_test = test_data["PRODUCT_TYPE_ID"]

# Convert the TF-IDF matrices to CSC format
text_column_numerical = csc_matrix(text_column_numerical)
text_column_numerical_test = csc_matrix(text_column_numerical_test)


# Turn each PRODUCT_TYPE_ID column into a one-column sparse matrix so it can
# be stacked next to the TF-IDF features. The ID enters the model as a single
# numeric feature (it is not one-hot encoded here).
product_type_id_column = csc_matrix(product_type_id_column.to_numpy().reshape(-1, 1))
product_type_id_column_test = csc_matrix(product_type_id_column_test.to_numpy().reshape(-1, 1))


# Combine the input features into a single matrix: TF-IDF columns followed by
# the PRODUCT_TYPE_ID column
X = hstack([text_column_numerical, product_type_id_column])

X_test = hstack([text_column_numerical_test,product_type_id_column_test])

# No split is done here: the whole sampled set is used for training.
# X_train, X_test, y_train, y_test = train_test_split(X, product_length_column, test_size=0.2)
X_train = X
y_train = product_length_column
# NOTE(review): y_test is bound to the test-set PRODUCT_TYPE_ID matrix, not to
# PRODUCT_LENGTH values, so any metric computed against y_test compares
# predicted lengths with type IDs.
y_test = product_type_id_column_test
# Create a linear regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)


In [None]:
# Evaluate the model on the held-out validation split.
# `y_test` holds the test set's PRODUCT_TYPE_ID column (not product lengths),
# so scoring `y_pred` against it says nothing about the model. `val_data`
# carries true PRODUCT_LENGTH values and was split off before the training
# sample was drawn, so it is used instead.
val_text = val_data["text"].fillna("")
X_val = hstack([
    csc_matrix(vectorizer.transform(val_text)),
    csc_matrix(val_data["PRODUCT_TYPE_ID"].to_numpy().reshape(-1, 1)),
])
y_val = val_data["PRODUCT_LENGTH"]
y_val_pred = model.predict(X_val)

# Score = 100 * (1 - MAPE), floored at 0
score = max( 0 , 100*(1-metrics.mean_absolute_percentage_error(y_val, y_val_pred)))
mse = mean_squared_error(y_val, y_val_pred)
print(f'Mean squared error: {mse}')
print(f'Score: {score}')

In [None]:
# Submission frame: the test PRODUCT_IDs next to the model's predictions,
# which are matched to the rows by position
df = pd.DataFrame({
    "PRODUCT_ID": test_data["PRODUCT_ID"],
    "PRODUCT_LENGTH": y_pred,
})

In [None]:
sample = pd.read_csv("sample_submission.csv")

In [None]:
sample

In [None]:
df.to_csv("predicted_submission.csv", index=False)