In [5]:
# Import all the packages needed for this notebook
import html
import warnings

import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.regexp import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
warnings.filterwarnings("ignore")

# Load the features and the labels; the first CSV column is used as the index
X_train = pd.read_csv('X_train.csv', delimiter=',', index_col=0)
y_train = pd.read_csv('Y_train.csv', delimiter=',', index_col=0)

# Concatenate them column-wise (aligned on the index) to work with a single DataFrame
X_train = pd.concat([X_train, y_train], axis = 1)

# Merge the designation and description columns; missing values become empty strings.
# Lower-casing is done once, at tokenization time, after all decoding steps.
X_train["Text"] = X_train['designation'].fillna('') + ' ' + X_train['description'].fillna('')

# Clean the text
# 1. Decode HTML entities (&eacute;, &nbsp;, &#39;, ...) into real characters, so that they
#    are transliterated like any other character instead of being split by the tokenizer
#    into fragments such as "eacute" in the middle of a word.
X_train['Text'] = X_train['Text'].astype('str').apply(html.unescape)
# 2. Remove HTML tags. They are replaced by a space (not an empty string) so that two
#    words separated only by a tag, e.g. "foo<br>bar", are not glued into "foobar".
X_train['Text'] = X_train['Text'].str.replace(r'<[^<>]*>', ' ', regex=True)
# 3. Transliterate accents and special characters to ASCII with unidecode
X_train['Text'] = X_train['Text'].apply(unidecode).astype('str')
# 4. Tokenize: keep runs of at least 3 letters/hyphens. The text is lower-cased here,
#    after unidecode, because transliteration can itself produce upper-case letters.
tokenizer = RegexpTokenizer(r"[a-zA-Z-]{3,}")
X_train['Text'] = X_train['Text'].apply(lambda x: tokenizer.tokenize(x.lower()))

# Drop the source columns now that they are merged into 'Text'
X_train = X_train.drop(columns=['designation', 'description'])

# Build the stop-word set from the NLTK lists of each language, loaded one at a time
stop_words = set()
for language in ('english', 'french', 'german'):
    stop_words.update(stopwords.words(language))
# The text is transliterated to ASCII with unidecode before tokenization, so accented
# stop words would never match a token. Add the transliterated form of every stop word
# (the original forms are kept as well). A snapshot list is used because a set must not
# be modified while it is being iterated.
stop_words.update(unidecode(word) for word in list(stop_words))
# In addition to the standard stop words, add the words that are useless for us
parasite_words_words = ['plus', 'peut', 'etre', 'tout', 'cette', 'tres']
html_code_words = ['rsquo', 'eacute', 'agrave', 'egrave', 'div', 'span', 'class', 'nbsp', 'amp', 'ecirc', 'ccedil', 'laquo', 'raquo']
stop_words.update(parasite_words_words)
stop_words.update(html_code_words)
# Helper that strips stop words from one tokenized document
def stop_words_filtering(mots) :
    """Return a new list with the tokens of `mots` that are not in `stop_words`.

    The order of the remaining tokens, and any duplicates among them, are preserved.
    """
    return [candidate for candidate in mots if candidate not in stop_words]
# Remove the stop words from every token list of the 'Text' column
X_train["Text"] = X_train["Text"].map(stop_words_filtering)

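# Lemmatization using spaCy and its French model (currently disabled; the WordNet lemmatizer below is used instead)
#nlp = spacy.load('fr_core_news_sm')                 # requires: python -m spacy download fr_core_news_sm  (TODO: run the download only when the model is missing)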
#def spacy_lemmatizer(list):        
#    text = ' '.join(list)
#    doc = nlp(text)
#    list_output = [token.lemma_ for token in doc]
#    return ' '.join(list_output)
#X_train['Text'] = X_train['Text'].apply(spacy_lemmatizer)

# Initialize the lemmatizer
# NOTE(review): WordNet is an English lexical resource, so French and German tokens are
# presumably returned unchanged — confirm whether the (disabled) spaCy French lemmatizer
# should be used for this corpus instead.
lemmatizer = WordNetLemmatizer()
# Function that lemmatizes a list of tokens
def lemmatize_tokens(tokens):
    """Return a list with `lemmatizer.lemmatize` applied to each token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))
# Lemmatize every token list of the 'Text' column
X_train['Text'] = X_train['Text'].map(lemmatize_tokens)

# Also keep each document as a single space-separated string
X_train['String'] = X_train['Text'].map(' '.join)


In [6]:
# Save the processed DataFrame as CSV (the index is not written)
# NOTE(review): the 'Text' column holds Python lists, which are presumably written as
# their string representation — confirm how this file is parsed when it is read back.
X_train.to_csv('text_lemm.csv', index=False)

In [23]:
# Preview the first rows of the processed DataFrame
X_train.head()

Unnamed: 0,productid,imageid,prdtypecode,Text,String
0,3804725264,1263597046,10,"[olivia, personalisiertes, notizbuch, seiten, ...",olivia personalisiertes notizbuch seiten punkt...
1,436067568,1008141237,2280,"[journal, art, ndeg, art, marche, salon, art, ...",journal art ndeg art marche salon art asiatiqu...
2,201115110,938777978,50,"[grand, stylet, ergonomique, bleu, gamepad, ni...",grand stylet ergonomique bleu gamepad nintendo...
3,50418756,457047496,1280,"[peluche, donald, europe, disneyland, marionne...",peluche donald europe disneyland marionnette d...
4,278535884,1077757786,2705,"[guerre, tuques, luc, grandeur, veut, organise...",guerre tuques luc grandeur veut organiser jeu ...
