In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

features = pd.read_csv('all.csv.gz' , sep=';', error_bad_lines=False, compression='gzip', dtype='str')

b'Skipping line 1437: expected 21 fields, saw 29\nSkipping line 3880: expected 21 fields, saw 29\nSkipping line 5741: expected 21 fields, saw 30\nSkipping line 5759: expected 21 fields, saw 30\nSkipping line 9816: expected 21 fields, saw 30\nSkipping line 31819: expected 21 fields, saw 24\n'
b'Skipping line 32862: expected 21 fields, saw 29\nSkipping line 35793: expected 21 fields, saw 30\nSkipping line 35794: expected 21 fields, saw 24\nSkipping line 35795: expected 21 fields, saw 30\nSkipping line 35796: expected 21 fields, saw 29\nSkipping line 35858: expected 21 fields, saw 32\nSkipping line 35859: expected 21 fields, saw 30\nSkipping line 35860: expected 21 fields, saw 24\nSkipping line 35861: expected 21 fields, saw 30\nSkipping line 35862: expected 21 fields, saw 29\nSkipping line 37564: expected 21 fields, saw 24\nSkipping line 37568: expected 21 fields, saw 32\nSkipping line 37569: expected 21 fields, saw 44\nSkipping line 37570: expected 21 fields, saw 32\nSkipping line 37571

In [52]:
print(features.columns)
features.shape

Index(['suf', 'nextCap', 'next2T', 'word', 'gaz', 'prevCap', 'next2W', 'cap',
       'prevT', 'prevW', 'nextT', 'simb', 'nextW', 'ini', 'pref', 'prev2Cap',
       'tag', 'prev2W', 'next2Cap', 'prev2T', 'class'],
      dtype='object')


(106446, 21)

In [53]:
features.sample(5)

Unnamed: 0,suf,nextCap,next2T,word,gaz,prevCap,next2W,cap,prevT,prevW,...,simb,nextW,ini,pref,prev2Cap,tag,prev2W,next2Cap,prev2T,class
27597,,min,v-fin,águas,,maxmin,apresentam,min,art,as,...,simb,estudadas,min,,,n,,min,,
24374,,min,punc,aos,,min,",",min,v-pcp,associados,...,alfa,quartzitos,min,,min,n,naqueles,min,adj,
51582,,min,prop,da,,min,conodont,min,n,fósseis,...,alfa,espécie,min,,min,v-pcp,os,maxmin,art,
74947,,min,prop,a,,min,snowball,min,adv,conforme,...,alfa,teoria,min,,min,art,(,maxmin,punc,
18285,,min,n,conjunto,,min,dados,min,pron-det,cada,...,alfa,de,min,,min,n,de,min,prp,


In [54]:
class_encoder = LabelEncoder()
target = class_encoder.fit_transform(features['class'].astype(str))
features.drop('class',axis=1,inplace=True)

In [55]:
data = pd.DataFrame()

for column in features.columns:
    feat_encoder = LabelEncoder()
    data[column] = feat_encoder.fit_transform(features[column].astype(str))
    
data.sample(5)

Unnamed: 0,suf,nextCap,next2T,word,gaz,prevCap,next2W,cap,prevT,prevW,nextT,simb,nextW,ini,pref,prev2Cap,tag,prev2W,next2Cap,prev2T
77562,5,2,6,4482,21,5,5977,4,18,36,13,11,7267,6,34,4,16,3483,2,6
61030,5,2,6,638,21,5,8285,2,23,4228,19,12,888,6,34,4,17,1212,2,6
33388,5,2,6,4141,21,5,11056,4,17,4430,1,52,11191,6,34,3,7,1520,2,14
5565,5,2,16,1672,21,5,10131,4,18,36,13,11,3372,6,34,4,2,13,2,16
65363,5,2,10,3249,21,5,10009,4,8,7376,17,11,3866,6,34,4,0,3434,1,15


In [56]:
from sklearn.svm import LinearSVC

model = LinearSVC()

acc = cross_val_score(model, data, target, cv=10, scoring='accuracy').mean()
print(acc)

0.858727947677


In [57]:
from sklearn import metrics
from sklearn.model_selection import KFold

## manual 10-fold cross-validation
kf = KFold(n_splits=10, random_state=None, shuffle=False)

predict_y = []
y_true = []

for train_index, test_index in kf.split(data):
    model = LinearSVC()
    
    X_train, X_test = data.values[train_index], data.values[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    model.fit(X_train, y_train)
    
    predict_y.extend(model.predict(X_test))
    y_true.extend(y_test)

print(class_encoder.classes_)
print(metrics.classification_report(y_true, predict_y))

['CONTEXTOgeologicoDeBACIA' 'EON' 'ERA' 'None' 'baciaSEDIMENTAR' 'epoca'
 'idade' 'nan' 'outro' 'periodo' 'sedimentaresCARBONATICAS'
 'sedimentaresORGANICAS' 'sedimentaresQUIMICAS'
 'sedimentaresSILICICLASTICAS' 'unidadeESTRATIGRAFICA']
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       592
          1       0.00      0.00      0.00        87
          2       0.14      0.01      0.01       129
          3       0.95      0.98      0.96    100536
          4       0.00      0.00      0.00       640
          5       0.16      0.03      0.06       352
          6       0.00      0.00      0.00       459
          7       0.00      0.00      0.00       160
          8       0.00      0.00      0.00      1107
          9       0.08      0.02      0.03       199
         10       0.00      0.00      0.00       209
         11       0.00      0.00      0.00        16
         12       0.00      0.00      0.00        23
         13       0.

  'precision', 'predicted', average, warn_for)


In [58]:
print(metrics.confusion_matrix(y_true, predict_y))

[[    0     0     0   574     0     0    18     0     0     0     0     0
      0     0     0]
 [    0     0     2    62     0     3     6     0     0     0     0     0
      0     7     7]
 [    0     0     1    78     0     2    15     0     0     0     0     0
      0     0    33]
 [    0     0     3 98117     0    52  1103     0     1    22     0     0
      0     6  1232]
 [    0     0     0   627     0     0    13     0     0     0     0     0
      0     0     0]
 [    0     0     1   226     0    12     5     1     1     5     0     1
      0     1    99]
 [    0     0     0   392     0     4     0     0     0     0     0     0
      0     0    63]
 [    0     0     0   151     0     2     0     0     0     3     0     0
      0     0     4]
 [    0     0     0  1084     0     0    19     0     0     1     0     0
      0     0     3]
 [    0     0     0   167     0     2     8     0     0     3     0     0
      0     0    19]
 [    0     0     0   208     0     0     1     0 