In [2]:
import re
import nltk
from nltk import word_tokenize,WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.layers import Embedding, LSTM, Dense
from keras import Input, Model
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lemur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lemur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lemur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lemur\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## FONCTIONS ANNEXES

In [3]:
def _preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    resources = getattr(_preprocess_resources, "_cached", None)
    if resources is None:
        resources = (frozenset(stopwords.words("english")), WordNetLemmatizer())
        _preprocess_resources._cached = resources
    return resources


def preprocess(text):
    """Clean one raw (possibly HTML) document into a space-separated token string.

    Steps, in order: strip HTML markup, replace every character outside
    [a-zA-Z0-9] with a space, tokenize, drop English stop words
    (case-insensitive comparison), and lemmatize the remaining tokens.
    The case of the surviving tokens is left unchanged.

    Parameters
    ----------
    text : str or None
        Raw document. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    # Missing cells would otherwise make BeautifulSoup raise; they carry no text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Removing HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Removing punctuation and non-alphanumeric characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)

    # The stop-word set and lemmatizer do not depend on the input, so they are
    # built once and reused instead of being recreated for every document.
    stop_words, lemmatizer = _preprocess_resources()

    # Tokenizing words and removing stop words
    kept = [w for w in word_tokenize(text) if w.lower() not in stop_words]

    # Lemmatizing words
    return " ".join(lemmatizer.lemmatize(w) for w in kept)

## ANALYSE DES DONNEES

In [4]:
# Render floats in fixed-point notation when frames are displayed.
pd.set_option('float_format', '{:f}'.format)

# The three CSV exports are read with the same encoding, so load them in one pass.
# NOTE(review): the saved output of this cell is a ParserError ("Calling
# read(nbytes) on source failed") -- confirm the files under data/ are present
# and fully readable before relying on the cells below.
csv_paths = ("data/Answers.csv", "data/Questions.csv", "data/Tags.csv")
answers_df, questions_df, tags_df = [
    pd.read_csv(csv_path, encoding='latin-1') for csv_path in csv_paths
]

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Remove all questions that have a negative score.
# .copy() makes the filtered frame independent, so the column added below is
# not written onto a slice of the original frame (SettingWithCopyWarning).
questions_df = questions_df[questions_df['Score'] >= 0].copy()

# Join title and body with a space: without a separator the last word of the
# title is glued to the first word of the body once the HTML tags are stripped.
questions_df['Text'] = questions_df['Title'] + ' ' + questions_df['Body']

# Only Id, Score and the combined Text are needed from here on.
questions_df = questions_df.drop(columns=['Title', 'Body', 'CreationDate', 'OwnerUserId'])

In [None]:
# Display the filtered questions frame (notebook rich output).
questions_df

In [None]:
# Drop the answer-specific columns and expose ParentId under the name 'Id',
# so that each answer can be joined to its question.
answers_df = (
    answers_df
    .drop(columns=['Id', 'OwnerUserId', 'CreationDate'])
    .rename(columns={'ParentId': 'Id'})
)

# Join on 'Id'; columns present in both frames receive the default
# _x (questions) / _y (answers) suffixes.
out = questions_df.merge(answers_df, on='Id')

In [None]:
# Display the merged question/answer frame (notebook rich output).
out

In [None]:
# For every question Id, find the row label of the answer with the highest
# Score_y, then keep only those rows.
best_answer_rows = out.groupby('Id')['Score_y'].idxmax()
out = out.loc[best_answer_rows]

In [None]:
# Most frequent tags, leaving out the 'python' tag itself
non_python_tags = tags_df[tags_df['Tag'] != 'python']['Tag']
top_tag_counts = non_python_tags.value_counts().sort_values(ascending=False)[:20]

# Bar chart of the 20 most frequent remaining tags
fig, ax = plt.subplots()
top_tag_counts.plot(ax=ax, kind='bar')

## NETTOYAGE


In [None]:
# Clean the question text. The result has to be stored back: Series.apply
# returns a new Series, so calling it without assignment leaves out['Text']
# as raw HTML for every later cell.
out = out.assign(Text=out['Text'].apply(preprocess))

# Show the cleaned column
out['Text']

In [None]:
# Build one shared vocabulary over the questions and the answers.
tokenizer = Tokenizer()

# Feed the two columns as separate documents. Adding the Series together
# concatenates each question with its answer without a separator, which can
# fuse the last question token and the first answer token into a word that
# exists in neither text.
tokenizer.fit_on_texts(pd.concat([out['Text'], out['Body']], ignore_index=True))

In [None]:
# Convert both text columns to lists of integer token ids using the shared tokenizer.
questions_seq, answers_seq = [
    tokenizer.texts_to_sequences(out[column]) for column in ('Text', 'Body')
]

In [None]:
# Sequences are too long -- TODO: plot the distribution of sequence lengths.

# Length of the longest tokenized question and of the longest tokenized answer
maxlen_questions = max(map(len, questions_seq))
maxlen_answer = max(map(len, answers_seq))

In [None]:
# Bring every sequence to a common length of 512 token ids, padding at the end.
MAX_SEQ_LEN = 512
padding_options = dict(maxlen=MAX_SEQ_LEN, padding='post')

questions_padded = pad_sequences(questions_seq, **padding_options)
answers_padded = pad_sequences(answers_seq, **padding_options)

In [None]:
# Display the shape of the padded question array.
questions_padded.shape

In [None]:
# Display the shape of the padded answer array.
answers_padded.shape

In [None]:
# Split dataset
SPLIT_SEED = 8

# First split: 80% of the pairs go to the test set, 20% remain for train + validation.
# NOTE(review): test_size=0.8 leaves only a fifth of the data for fitting --
# confirm this is intentional and not meant to be test_size=0.2.
X_train, X_test, y_train, y_test = train_test_split(
    questions_padded,
    answers_padded,
    test_size=0.8,
    shuffle=True,
    random_state=SPLIT_SEED,
)

# Second split: a quarter of the remaining pairs becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=SPLIT_SEED,
)

## MODEL & TRAINING

In [None]:
# Build the seq2seq (encoder-decoder) model
vocab_size = len(tokenizer.word_index) + 1  # +1 because token ids start at 1
EMBEDDING_DIM = 128
LATENT_DIM = 256

# --- Encoder: embeds the question and keeps only the final LSTM states ---
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, EMBEDDING_DIM)(encoder_inputs)
encoder_lstm = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# --- Decoder: starts from the encoder states and emits one vector per timestep ---
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, EMBEDDING_DIM)(decoder_inputs)
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Softmax over the vocabulary at every decoder timestep
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Integer targets are used directly, hence the sparse variant of the loss.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss="sparse_categorical_crossentropy")

In [None]:
# Train the model with teacher forcing.
# At timestep t the decoder must receive the answer token from t-1 and be
# asked to predict the token at t. Feeding the same unshifted array as both
# decoder input and target lets the network minimise the loss by copying its
# input, so it never learns to generate an answer.
def _shift_right(sequences):
    """Return `sequences` delayed by one timestep, with 0 in the first column.

    Index 0 is never assigned to a word by the Keras Tokenizer, so it serves
    here as the start-of-sequence marker.
    """
    shifted = np.zeros_like(sequences)
    shifted[:, 1:] = sequences[:, :-1]
    return shifted


model.fit([X_train, _shift_right(y_train)], y_train,
          validation_data=([X_val, _shift_right(y_val)], y_val),
          batch_size=35, epochs=50)

In [None]:
# Save the trained model to 'myChatbot.h5' (path is relative to the working directory)
model.save('myChatbot.h5')