# TEST PIPELINE

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.base import TransformerMixin
import pandas as pd
import numpy as np
import csv
import re



## LOAD DATA

In [2]:
# Read the labelled responses and drop every row that has a missing value in
# any column. The surviving rows keep their original index labels.
df = pd.read_csv('data/161207_ZikaLabels.csv').dropna(axis=0)
# Raw free-text column that the cleaning steps below operate on.
X = df['diagnosisRAW']

## DECODE ISO-8859-2 & ENCODE ASCII

In [3]:
class AsciiTransformer(TransformerMixin):
    """Re-encode every element of a Series as plain ASCII.

    Each element is decoded as ISO-8859-2 and then encoded to ASCII with the
    ``'ignore'`` error handler, so any character without an ASCII encoding is
    dropped outright (it is removed, not replaced by an unaccented letter).
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a new ``pd.Series`` with the ASCII form of each element of ``X``."""
        def to_ascii(raw):
            decoded = raw.decode('ISO-8859-2')
            return decoded.encode('ASCII', 'ignore')

        converted = X.apply(to_ascii)
        return pd.Series(converted)
# Run the ASCII clean-up on the raw text, then preview the first ten rows
# and the total number of rows.
t1 = AsciiTransformer().fit_transform(X)
print(t1.head(10))
print(len(t1))

0                               estava somente gripado
1    Me passou remdios para enchaqueca , soro e o d...
2        repouso e tomar agua e o remedio nimesulida!!
3                                                  yes
4                                                  60%
5                                               Hiniti
6     Pediu para eu ficar de repouso e no ir trabalhar
7                                               Fimose
8    Ele afirmou que eu no tinha nenhum tipo de zik...
9                                     Apenas uma gripe
Name: diagnosisRAW, dtype: object
6983


## MAKE LOWERCASE

In [4]:
class LowerCaseTransformer(TransformerMixin):
    """Lower-case every element of a Series of strings."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a new ``pd.Series`` where each element is ``element.lower()``."""
        def to_lower(text):
            return text.lower()

        lowered = X.apply(to_lower)
        return pd.Series(lowered)
# Lower-case the ASCII text, then preview the first ten rows and the row count.
t2 = LowerCaseTransformer().fit_transform(t1)
print(t2.head(10))
print(len(t2))

0                               estava somente gripado
1    me passou remdios para enchaqueca , soro e o d...
2        repouso e tomar agua e o remedio nimesulida!!
3                                                  yes
4                                                  60%
5                                               hiniti
6     pediu para eu ficar de repouso e no ir trabalhar
7                                               fimose
8    ele afirmou que eu no tinha nenhum tipo de zik...
9                                     apenas uma gripe
Name: diagnosisRAW, dtype: object
6983


## REMOVE SYMBOLS

In [5]:
class RemoveSymsTransformer(TransformerMixin):
    """Replace every symbol character in each text with a single space.

    A "symbol" is any character that is not an ASCII letter, a digit,
    whitespace, or a period. Each such character is replaced one-for-one by a
    space, so the length of every string is unchanged.
    """

    # Compiled once instead of on every element.
    # The letter ranges must be ``A-Z`` and ``a-z``: the range ``A-z`` also
    # spans the ASCII characters [ \ ] ^ _ ` that sit between 'Z' and 'a',
    # which would wrongly be kept as if they were letters.
    _SYMBOLS = re.compile(r'[^A-Za-z0-9\s\.]')

    def transform(self, X, **transform_params):
        """Return a new ``pd.Series`` with symbols in each element of ``X`` blanked out."""
        return pd.Series(X.apply(lambda text: self._SYMBOLS.sub(' ', text)))

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self
# Blank out symbols in the lower-cased text, then preview the first ten rows
# and the row count.
t3 = RemoveSymsTransformer().fit_transform(t2)
print(t3.head(10))
print(len(t3))

0                               estava somente gripado
1    me passou remdios para enchaqueca   soro e o d...
2        repouso e tomar agua e o remedio nimesulida  
3                                                  yes
4                                                  60 
5                                               hiniti
6     pediu para eu ficar de repouso e no ir trabalhar
7                                               fimose
8    ele afirmou que eu no tinha nenhum tipo de zik...
9                                     apenas uma gripe
Name: diagnosisRAW, dtype: object
6983


## REMOVE STOP WORDS

In [6]:
class RemoveStopWordsTransformer(TransformerMixin):
    """Remove stop words from whitespace-separated text.

    Stop words are read from a CSV-style file: for each row, the first
    whitespace-delimited token of the first field is taken, decoded as
    ISO-8859-2 and reduced to ASCII (characters without an ASCII encoding are
    dropped). Words that end up empty are ignored.

    Parameters
    ----------
    stop_words_path : str, optional
        Path of the stop-word file. Defaults to ``'pt_stop_words.txt'``.
    """

    def __init__(self, stop_words_path='pt_stop_words.txt'):
        self.stop_words_path = stop_words_path

    def _load_stop_words(self):
        """Read the stop-word file and return its words as a set of ASCII strings."""
        stop = set()
        with open(self.stop_words_path, 'rb') as f:
            for row in csv.reader(f):
                # A blank line yields an empty row; indexing it would raise.
                if not row:
                    continue
                pieces = row[0].split()
                # An empty or whitespace-only first field has no token to take.
                if not pieces:
                    continue
                word = pieces[0].decode('ISO-8859-2').encode('ASCII', 'ignore')
                if word:
                    stop.add(word)
        return stop

    def transform(self, X, **transform_params):
        """Return a new ``pd.Series`` with stop words removed from each element of ``X``.

        Each text is split on whitespace, tokens found in the stop-word set
        are discarded, and the remaining tokens are re-joined with single
        spaces. A text made up only of stop words becomes ``''``.
        """
        # A set makes each membership test constant-time instead of a list scan.
        stop = self._load_stop_words()
        return pd.Series(X.apply(
            lambda x: ' '.join([token for token in x.split() if token not in stop])))

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self
# Strip stop words from the symbol-free text.
t4 = RemoveStopWordsTransformer().fit_transform(t3)
# Positions (0-based, in iteration order) of the texts that are empty after
# stop-word removal.
empty_index = [i for i, text in enumerate(t4) if text == '']
print(t4.head(10))

0                                      somente gripado
1    passou remdios enchaqueca soro diagnstico infe...
2                repouso tomar agua remedio nimesulida
3                                                  yes
4                                                   60
5                                               hiniti
6                     pediu ficar repouso ir trabalhar
7                                               fimose
8                        afirmou nenhum tipo zika saca
9                                         apenas gripe
Name: diagnosisRAW, dtype: object


## ZIKA WORD COUNTER

In [7]:
class ZikaCounterTransformer(TransformerMixin):
    """Count "zika"-shaped substrings in each text.

    The pattern is ``z``, then any two characters, then ``a``. It matches
    "zika" and other spellings of that shape, as well as any unrelated
    four-character run that has the same form.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a new ``pd.Series`` of non-overlapping match counts, one per element of ``X``."""
        pattern = re.compile(r'z.{2}a')

        def count_matches(text):
            hits = pattern.findall(text)
            return len(hits)

        counts = X.apply(count_matches)
        return pd.Series(counts)
# Feature 1: per-text count of "zika"-shaped substrings; preview the first ten.
f1 = ZikaCounterTransformer().fit_transform(t4)
print(f1.head(10))

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    1
9    0
Name: diagnosisRAW, dtype: int64


## SENTIMENT ANALYZER
- not utilized, since the sentiment analyzer is built on an English corpus
- in the future, can use a Spanish Twitter corpus OR GYANT's own message corpus

In [8]:
class SentimentTransformer(TransformerMixin):
    """Score each text with ``SentimentIntensityAnalyzer.polarity_scores``.

    The result has one row per input text and four columns, ``positive``,
    ``neutral``, ``negative`` and ``compound``, taken from the ``'pos'``,
    ``'neu'``, ``'neg'`` and ``'compound'`` entries of the score dict.
    """

    def transform(self, X, **transform_params):
        """Return a ``pd.DataFrame`` of sentiment scores for every element of ``X``.

        The frame has a fresh 0..n-1 index in the iteration order of ``X``.
        An empty ``X`` gives an empty frame that still has the four columns.
        """
        # One analyzer and one scoring call per text are enough: all four
        # values come out of the same polarity_scores() dict.
        analyzer = SentimentIntensityAnalyzer()
        score_keys = ['pos', 'neu', 'neg', 'compound']
        matrix = []
        # Iterate over the argument itself, not a notebook-level variable, so
        # the output always has one row per input text.
        for text in X:
            scores = analyzer.polarity_scores(text)
            matrix.append([scores[key] for key in score_keys])
        return pd.DataFrame(data=matrix,
                            columns=['positive', 'neutral', 'negative', 'compound'])

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self
# Feature 2: sentiment scores of the cleaned text; preview the first ten rows.
f2 = SentimentTransformer().fit_transform(t4)
print(f2.head(10))

   positive  neutral  negative  compound
0       0.0    1.000     0.000    0.0000
1       0.0    0.755     0.245   -0.3818
2       0.0    1.000     0.000    0.0000
3       1.0    0.000     0.000    0.4019
4       0.0    1.000     0.000    0.0000
5       0.0    1.000     0.000    0.0000
6       0.0    1.000     0.000    0.0000
7       0.0    1.000     0.000    0.0000
8       0.0    1.000     0.000    0.0000
9       0.0    1.000     0.000    0.0000


## LATENT SEMANTIC ANALYSIS (LSA)

### TERM FREQUENCY - INVERSE DOCUMENT FREQUENCY (TFIDF)

In [9]:
# Fit a TF-IDF vectorizer with default settings on the cleaned text and turn
# each text into a weighted term vector; preview the first ten rows.
tfidf = TfidfVectorizer()
f3A = tfidf.fit_transform(t4)
print(f3A[:10])

  (0, 1221)	0.774738690229
  (0, 2311)	0.632281552683
  (1, 1211)	0.328278267704
  (1, 1640)	0.194753740367
  (1, 1366)	0.391688937059
  (1, 1332)	0.297714342697
  (1, 764)	0.378040473811
  (1, 2314)	0.348517634656
  (1, 881)	0.422080727799
  (1, 2123)	0.29373878914
  (1, 1844)	0.288853851929
  (2, 1702)	0.618109229786
  (2, 2127)	0.415920620738
  (2, 83)	0.453269816647
  (2, 2413)	0.323205512456
  (2, 2146)	0.367472022518
  (3, 2601)	1.0
  (4, 15)	1.0
  (5, 1257)	1.0
  (6, 2434)	0.564960771084
  (6, 1378)	0.464578441423
  (6, 1083)	0.363377204117
  (6, 1854)	0.448218935295
  (6, 2146)	0.363377204117
  (7, 1095)	1.0
  (8, 2218)	0.569537170064
  (8, 2611)	0.23745402764
  (8, 2397)	0.463521333715
  (8, 1678)	0.328792737079
  (8, 71)	0.544321877117
  (9, 1222)	0.571860415168
  (9, 203)	0.820350940491


## SINGULAR VALUE DECOMPOSITION (SVD)
- similar to PCA, but does not center the data first, so it can work directly (and efficiently) on sparse matrices such as the TF-IDF output

In [10]:
# Reduce the TF-IDF matrix to 100 components. TruncatedSVD uses a randomized
# solver by default, so random_state is pinned to make the components
# identical from one run of the notebook to the next.
f3B = TruncatedSVD(n_components=100, random_state=0).fit_transform(f3A)
print(f3B[:10])

[[  1.01101093e-03   2.11702444e-04   2.14083732e-03   5.47976333e-04
   -2.19973156e-05  -6.29575732e-05   1.21791675e-05   5.98013608e-05
   -3.82815328e-05   1.36166220e-05   1.40613549e-04   2.12198118e-04
    3.02064615e-03   8.65935213e-03   2.09414897e-03   4.42110642e-03
    3.48819929e-06  -2.19266216e-03   3.81244262e-06   4.01084856e-04
    4.22200545e-04   5.35990040e-03  -8.58044254e-04   1.18370601e-02
    1.09820432e-02  -4.42713722e-03  -5.13438024e-04  -3.45213069e-05
    4.37222843e-03   2.13504274e-03   4.49593624e-03   8.51163580e-03
    1.40168133e-03  -1.52224328e-04  -7.51027583e-04  -1.11334526e-03
   -2.37188461e-04  -1.10297322e-03   3.64787207e-04   9.01875001e-05
    9.20821485e-04   3.04813264e-05  -7.57432167e-04  -6.53577987e-04
   -1.33653711e-03  -8.22523827e-04  -4.84720299e-04  -3.11429667e-04
   -2.69682784e-04  -2.29885169e-03  -1.21335619e-03  -8.78719028e-04
    8.75203895e-04  -1.17559735e-03  -1.60688364e-03   2.30106539e-04
   -2.04852040e-04  