# Predict Tinnitus Diagnostic Based on NPL of hybrid patient data
_find relevant hidden features based on standard questionnaires and free-text description_

Load configuration

In [1]:
import os
PATH = os.getcwd() 
import sys
sys.path.append(PATH + '/../')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
from utils.config import Config
import pyLDAvis
import pyLDAvis.sklearn as sklearnvis
import pickle
pyLDAvis.enable_notebook()

print("Config loaded")

Config loaded


Load and visualize csv data with panda

In [2]:
# read entire data set
df = pd.read_csv(Config.csv_files[0], sep=';', encoding='ISO-8859-1')

# load additional info
df_add = pd.read_csv(Config.additional_text[0], sep=';', encoding='ISO-8859-1')

# merge courrier and info
df["courrier_add"] = df["courrier"].astype(str) + df_add["0"]
print(df["courrier_add"])

0       Cher Confrère, \nCher Ami,\n\nMerci de mavoir...
1       Merci de m'avoir confié Monsieur NOTELET Marce...
2       Merci de mavoir confié Monsieur DAUPHIN Jean-...
3       \nMerci de mavoir confié Monsieur FLAHAUT Arn...
4                                                     NaN
                              ...                        
2598    Cher Confrère,\n\nMerci de m'avoir confié Mme ...
2599    nan fatigue surdité somnolence aigu progressif...
2600                     nan fatigue surdité aigu brutal 
2601    nan surdité aigu brutal  dépression anxiété hy...
2602    nan endormissement surdité somnolence pulsatil...
Name: courrier_add, Length: 2603, dtype: object


In [5]:
courriers=df["courrier_add"]# this is what we should spare
courriers_cleaned=[courrier for courrier in courriers if isinstance(courrier,str)]

In [3]:
stop_words = Config.stop_words


In [6]:
tf_vectorizer = CountVectorizer(encoding='ISO-8859-1',
                                strip_accents = None,
                                stop_words = stop_words,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 1)
dtm_tf = tf_vectorizer.fit_transform(courriers_cleaned)
print(dtm_tf.shape)
pickle.dump(tf_vectorizer, open('../data/tf_vectorizer.pk', 'wb'))
pickle.dump(dtm_tf, open('../data/dtm_tf.pk', 'wb'))

(2534, 15926)


In [7]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(courriers_cleaned)
print(dtm_tfidf.shape)
pickle.dump(tfidf_vectorizer, open('../data/tfidf_vectorizer.pk', 'wb'))
pickle.dump(dtm_tfidf, open('../data/dtm_tfidf.pk', 'wb'))



(2534, 15926)


In [13]:
# for TF DTM
n_components = 10
lda_tf = LatentDirichletAllocation(n_components=n_components, random_state=0)
lda_tf.fit(dtm_tf)
pickle.dump(lda_tf, open('../data/lda_tf.pk', 'wb'))


In [14]:
lda_tf = pickle.load( open('../data/lda_tf.pk', "rb" ) )
tf_vectorizer = pickle.load( open('../data/tf_vectorizer.pk', "rb" ) )
dtm_tf = pickle.load( open('../data/dtm_tf.pk', "rb" ) )
vis_data = sklearnvis.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(vis_data)


In [18]:
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=n_components, random_state=0)
lda_tfidf.fit(dtm_tfidf)
pickle.dump(lda_tf, open('../data/lda_tfidf.pk', 'wb'))

In [10]:
lda_tfidf = pickle.load( open('../data/lda_tfidf.pk', "rb" ) )
tfidf_vectorizer = pickle.load( open('../data/tfidf_vectorizer.pk', "rb" ) )
dtm_tfidf = pickle.load( open('../data/dtm_tfidf.pk', "rb" ) )
vis_data = sklearnvis.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
pyLDAvis.display(vis_data)