In [1]:
#load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize']=(10.0,8.0)
import seaborn as sns
from scipy import stats
from scipy.stats import norm

In [2]:
#load data
#import csv
train_text = pd.read_csv("training_text",sep='\|\|',engine='python',names=['Id','Text'])
train_var = pd.read_csv("training_variants")


In [3]:
test_text = pd.read_csv("test_text",sep='\|\|',engine='python',names=['Id','Text'],skiprows=1)
test_var = pd.read_csv("test_variants")
test=test_text.join(test_var)
test.drop('ID',axis=1,inplace=True)

test.head()

Unnamed: 0,Id,Text,Gene,Variation
0,0,2. This mutation resulted in a myeloproliferat...,ACSL4,R570S
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...,NAGLU,P521L
2,2,Vascular endothelial growth factor receptor (V...,PAH,L333F
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...,ING1,A148D
4,4,Abstract Retinoblastoma is a pediatric retina...,TMEM216,G77A


In [4]:
train_var.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [5]:
train=train_text.join(train_var)
train.drop('ID',axis=1,inplace=True)
train.head()

Unnamed: 0,Id,Text,Gene,Variation,Class
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,2
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,3
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4


In [6]:
train.describe(include=[np.object])

Unnamed: 0,Text,Gene,Variation
count,3321,3321,3321
unique,1921,264,2996
top,The PTEN (phosphatase and tensin homolog) phos...,BRCA1,Truncating Mutations
freq,53,264,93


In [7]:
train.drop('Id',axis=1).describe()

Unnamed: 0,Class
count,3321.0
mean,4.365854
std,2.309781
min,1.0
25%,2.0
50%,4.0
75%,7.0
max,9.0


In [8]:
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
from sklearn.feature_extraction import text 

stop = text.ENGLISH_STOP_WORDS.union(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','%'])

In [9]:
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=stop,lowercase=True)

In [11]:
%%time
vec = vectorizer.fit_transform(train['Text'])

Wall time: 17min 32s


In [12]:
vectorizerGene = TfidfVectorizer(tokenizer=tokenize, stop_words=stop,lowercase=True,vocabulary=vectorizer.get_feature_names())
vectorizerVariation = TfidfVectorizer(tokenizer=tokenize, stop_words=stop,lowercase=True,vocabulary=vectorizer.get_feature_names())

In [13]:
vecGene = vectorizerGene.fit_transform(train['Gene'])
vecVariation =  vectorizerVariation.fit_transform(train['Variation'])

In [14]:
import scipy.sparse as sps

In [15]:
new=sps.hstack((vec,vecGene,vecVariation))

In [16]:
df_class= pd.get_dummies(train['Class'],prefix='Class')
df_class.head(10)

Unnamed: 0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0
7,1,0,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0


In [17]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(new,train['Class'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
result=pd.DataFrame(index=train.index,data=logistic.predict_proba(new),columns=['Class1','Class2','Class3','Class4','Class5','Class6','Class7','Class8','Class9',])
result.head(10)

Unnamed: 0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9
0,0.874102,0.025892,0.006372,0.013433,0.009583,0.02278,0.036898,0.004883,0.006058
1,0.05525,0.392365,0.021064,0.329067,0.053851,0.06503,0.068146,0.0069,0.008326
2,0.05525,0.392365,0.021064,0.329067,0.053851,0.06503,0.068146,0.0069,0.008326
3,0.05801,0.072549,0.112022,0.574819,0.079106,0.037368,0.051299,0.006648,0.008179
4,0.045537,0.045433,0.020327,0.67952,0.118997,0.046246,0.033167,0.005045,0.005728
5,0.045537,0.045433,0.020327,0.67952,0.118997,0.046246,0.033167,0.005045,0.005728
6,0.048576,0.048465,0.021683,0.525316,0.259755,0.049333,0.03538,0.005382,0.00611
7,0.495665,0.018053,0.019614,0.40723,0.024371,0.012493,0.014331,0.003569,0.004674
8,0.039077,0.041072,0.021921,0.709286,0.080084,0.037987,0.058242,0.005501,0.006832
9,0.043824,0.046643,0.024055,0.695673,0.061304,0.037638,0.076222,0.006313,0.008328


In [20]:
def stand(x):
    if x<0.01:
        return 0
    elif x>=0.8:
        return 1
    else:
        return x

In [21]:
result=result.applymap(stand)
result.head()

Unnamed: 0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9
0,1.0,0.025892,0.0,0.013433,0.0,0.02278,0.036898,0.0,0.0
1,0.05525,0.392365,0.021064,0.329067,0.053851,0.06503,0.068146,0.0,0.0
2,0.05525,0.392365,0.021064,0.329067,0.053851,0.06503,0.068146,0.0,0.0
3,0.05801,0.072549,0.112022,0.574819,0.079106,0.037368,0.051299,0.0,0.0
4,0.045537,0.045433,0.020327,0.67952,0.118997,0.046246,0.033167,0.0,0.0


In [23]:
%%time
#testvectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=stop,lowercase=True,vocabulary=vectorizer.get_feature_names())
testvec = vectorizer.transform(test['Text'])


Wall time: 27min 37s


In [24]:
#testvectorizerGene = TfidfVectorizer(tokenizer=tokenize, stop_words=stop,lowercase=True,vocabulary=vectorizer.get_feature_names())
#testvectorizerVariation = TfidfVectorizer(tokenizer=tokenize, stop_words=stop,lowercase=True,vocabulary=vectorizer.get_feature_names())

testvecGene = vectorizerGene.fit_transform(test['Gene'])
testvecVariation =  vectorizerVariation.fit_transform(test['Variation'])


In [25]:
testnew=sps.hstack((testvec,testvecGene,testvecVariation))

In [26]:
result['ID']=test['Id']
c=['Class1','Class2','Class3','Class4','Class5','Class6','Class7','Class8','Class9',]
result=pd.DataFrame(index=test.Id,data=logistic.predict_proba(testnew),columns=['Class1','Class2','Class3','Class4','Class5','Class6','Class7','Class8','Class9',])
result[c]=result[c].applymap(stand)
result.head(10)

Unnamed: 0_level_0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.037103,0.136367,0.010092,0.062095,0.021874,0.029369,0.691118,0.0,0.0
1,0.095125,0.091371,0.019526,0.415517,0.04181,0.062342,0.250853,0.010269,0.013186
2,0.093701,0.12536,0.021659,0.245855,0.053134,0.087869,0.348138,0.01165,0.012634
3,0.063906,0.140647,0.019713,0.120999,0.041185,0.032649,0.559474,0.0,0.011613
4,0.105183,0.049581,0.015891,0.365744,0.043305,0.032315,0.369537,0.0,0.0
5,0.142559,0.161218,0.018178,0.420719,0.0607,0.049369,0.125831,0.0,0.011915
6,0.09152,0.098679,0.050248,0.148377,0.033478,0.03286,0.52798,0.0,0.0
7,0.105854,0.086524,0.022242,0.392598,0.099362,0.070008,0.204564,0.0,0.0
8,0.153905,0.230365,0.020461,0.177221,0.042063,0.061332,0.290935,0.010926,0.012791
9,0.122082,0.124284,0.016565,0.172354,0.036184,0.036152,0.470446,0.010429,0.011504


In [27]:
result.to_csv("Submission5.csv")