# Predict Tinnitus Diagnosis Based on NLP of Hybrid Patient Data
_Find relevant hidden features from standard questionnaires and free-text descriptions_

Load configuration

In [1]:
# Notebook setup: extend the import path, import the libraries used below and
# switch pyLDAvis to notebook display mode.
import os
PATH = os.getcwd()  # directory the notebook is executed from
import sys
# Add the parent of the working directory to the module search path; this must
# run before the `utils.config` import below (presumably `utils` lives one
# level above the notebook -- TODO confirm).
sys.path.append(PATH + '/../')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
from utils.config import Config
import pyLDAvis
import pyLDAvis.sklearn as sklearnvis
import pickle
# Render pyLDAvis visualisations inline in the notebook.
pyLDAvis.enable_notebook()

# Only signals that the cell ran to completion.
print("Config loaded")

Config loaded


Load and visualize CSV data with pandas

In [2]:
# Both CSV files are read with the same options: ';'-separated, ISO-8859-1 encoded.
csv_options = {"sep": ';', "encoding": 'ISO-8859-1'}

# Main data set: first path listed in Config.csv_files.
df = pd.read_csv(Config.csv_files[0], **csv_options)

# Additional free text: first path listed in Config.additional_text.
df_add = pd.read_csv(Config.additional_text[0], **csv_options)

# Append the additional text (column "0") to each letter ("courrier").
# The two Series are combined element-wise, aligned on their index.
# NOTE(review): the strings are joined without a separator, astype(str) turns a
# missing letter into the literal text "nan", and a missing additional text
# makes the whole result NaN -- confirm this is the intended behaviour.
letters_as_text = df["courrier"].astype(str)
df["courrier_add"] = letters_as_text + df_add["0"]
print("done")

done


In [3]:
# Merged letter texts, one entry per row of df.
courriers = df["courrier_add"]
# Keep only entries that are real strings; everything else (such as NaN) is dropped.
courriers_cleaned = list(filter(lambda text: isinstance(text, str), courriers))

In [4]:
# Stop-word list taken from the project configuration; it is passed to the
# vectorizers below. Printed here for a quick visual check.
stop_words = Config.stop_words
print(stop_words)

['alors', 'au', 'aucuns', 'aussi', 'autre', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'des', 'du', 'dedans', 'dehors', 'depuis', 'devrait', 'doit', 'donc', 'dos', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'fait', 'faites', 'fois', 'font', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'là', 'ma', 'maintenant', 'mais', 'mes', 'mien', 'moins', 'mon', 'mot', 'même', 'ni', 'nommés', 'notre', 'nous', 'ou', 'où', 'par', 'parce', 'pas', 'peut', 'peu', 'plupart', 'pour', 'pourquoi', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'tellement', 'tels', 'tes', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'voient', 'vont', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être', 'qui', 'oui', 'vos', 'nos', 'merci', 'plus', 'docteur', 'cher

In [5]:
# Build the raw term-count (bag-of-words) document-term matrix.
#   token_pattern: tokens are runs of at least three ASCII letters
#   max_df = 0.5 : ignore terms that occur in more than half of the documents
#   min_df = 1   : keep terms that occur in at least one document
# NOTE(review): with strip_accents=None and an ASCII-only token pattern, words
# containing accented characters cannot match as a whole -- confirm this is
# intended for French text (several configured stop words are accented).
tf_vectorizer = CountVectorizer(encoding='ISO-8859-1',
                                strip_accents = None,
                                stop_words = stop_words,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5,
                                min_df = 1)
dtm_tf = tf_vectorizer.fit_transform(courriers_cleaned)
print(dtm_tf.shape)

# Persist the fitted vectorizer and the matrix. The context managers guarantee
# that each file is flushed and closed even if pickling raises.
with open('../data/tf_vectorizer.pk', 'wb') as fh:
    pickle.dump(tf_vectorizer, fh)
with open('../data/dtm_tf.pk', 'wb') as fh:
    pickle.dump(dtm_tf, fh)

(2534, 15906)


In [6]:
# Build the TF-IDF document-term matrix, reusing every parameter of the count
# vectorizer so both matrices are built with the same tokenisation settings.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(courriers_cleaned)
print(dtm_tfidf.shape)

# Persist the fitted vectorizer and the matrix. The context managers guarantee
# that each file is flushed and closed even if pickling raises.
with open('../data/tfidf_vectorizer.pk', 'wb') as fh:
    pickle.dump(tfidf_vectorizer, fh)
with open('../data/dtm_tfidf.pk', 'wb') as fh:
    pickle.dump(dtm_tfidf, fh)



(2534, 15906)


In [9]:
# LDA topic model fitted on the raw term-count matrix.
# n_components is the number of topics; it is reused for the TF-IDF model.
n_components = 10
lda_tf = LatentDirichletAllocation(n_components=n_components, random_state=42)
lda_tf.fit(dtm_tf)

# Persist the fitted model; the context manager closes the file reliably.
with open('../data/lda_tf.pk', 'wb') as fh:
    pickle.dump(lda_tf, fh)


In [11]:
# Reload the term-count artefacts from disk and visualise the topics.
# Files are opened with context managers so the handles are always closed.
# NOTE: pickle.load executes arbitrary code from the file -- only load pickles
# produced by this notebook.
with open('../data/lda_tf.pk', 'rb') as fh:
    lda_tf = pickle.load(fh)
with open('../data/tf_vectorizer.pk', 'rb') as fh:
    tf_vectorizer = pickle.load(fh)
with open('../data/dtm_tf.pk', 'rb') as fh:
    dtm_tf = pickle.load(fh)

vis_data = sklearnvis.prepare(lda_tf, dtm_tf, tf_vectorizer)
# Must stay the last expression of the cell so the notebook renders it.
pyLDAvis.display(vis_data)


In [18]:
# LDA topic model fitted on the TF-IDF matrix (same number of topics as the
# term-count model, taken from n_components).
lda_tfidf = LatentDirichletAllocation(n_components=n_components, random_state=0)
lda_tfidf.fit(dtm_tfidf)

# Persist the TF-IDF model. The original cell pickled `lda_tf` here, so
# lda_tfidf.pk silently contained the term-count model instead.
with open('../data/lda_tfidf.pk', 'wb') as fh:
    pickle.dump(lda_tfidf, fh)

In [10]:
# Reload the TF-IDF artefacts from disk and visualise the topics.
# Files are opened with context managers so the handles are always closed.
# NOTE: pickle.load executes arbitrary code from the file -- only load pickles
# produced by this notebook.
with open('../data/lda_tfidf.pk', 'rb') as fh:
    lda_tfidf = pickle.load(fh)
with open('../data/tfidf_vectorizer.pk', 'rb') as fh:
    tfidf_vectorizer = pickle.load(fh)
with open('../data/dtm_tfidf.pk', 'rb') as fh:
    dtm_tfidf = pickle.load(fh)

vis_data = sklearnvis.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
# Must stay the last expression of the cell so the notebook renders it.
pyLDAvis.display(vis_data)