In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

import math

In [17]:
X = pd.read_csv("dados/train_images.csv", header=None)
y = pd.read_csv("dados/train_labels.csv")["Volcano?"]
X_test = pd.read_csv("dados/test_images.csv", header=None)
y_test = pd.read_csv("dados/test_labels.csv")["Volcano?"]

In [15]:
print(X.shape)
print(y.shape)

print(X_test.shape)
print(y_test.shape)

(7000, 12100)
(7000,)
(2734, 12100)
(2734,)


In [12]:
print(X.head())
print(y.head())

print(X_test.head())
print(y_test.head())

   0      1      2      3      4      5      6      7      8      9      \
0     95    101     99    103     95     86     96     89     70    104   
1     91     92     91     89     92     93     96    101    107    104   
2     87     70     72     74     84     78     93    104    106    106   
3      0      0      0      0      0      0      0      0      0      0   
4    114    118    124    119     95    118    105    116    123    112   

   ...    12090  12091  12092  12093  12094  12095  12096  12097  12098  12099  
0  ...      111    107     92     89    103     99    117    116    118     96  
1  ...      103     92     93     95     98    105    104    100     90     81  
2  ...       84     71     95    102     94     80     91     80     84     90  
3  ...       94     81     89     84     80     90     92     80     88     96  
4  ...      116    113    102     93    109    104    106    117    111    115  

[5 rows x 12100 columns]
0    1
1    0
2    0
3    0
4    0
Na

In [19]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

In [22]:
classifier = SVC(kernel = 'linear' , random_state = 0)
classifier.fit(X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
predictions = classifier.predict(X_test)
print(classification_report(y_test,predictions))
print("Score: {}".format(accuracy_score(y_test, predictions)))

             precision    recall  f1-score   support

          0       0.93      0.97      0.95      2300
          1       0.80      0.64      0.71       434

avg / total       0.91      0.92      0.91      2734

Score: 0.9180687637161667


In [25]:
def balanced_subsample(x,y,subsample_size=1.0):

    class_xs = []
    min_elems = None

    for yi in np.unique(y):
        elems = x[(y == yi)]
        class_xs.append((yi, elems))
        if min_elems == None or elems.shape[0] < min_elems:
            min_elems = elems.shape[0]

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems*subsample_size)

    xs = []
    ys = []

    for ci,this_xs in class_xs:
        if len(this_xs) > use_elems:
            np.random.shuffle(this_xs)

        x_ = this_xs[:use_elems]
        y_ = np.empty(use_elems)
        y_.fill(ci)

        xs.append(x_)
        ys.append(y_)

    xs = np.concatenate(xs)
    ys = np.concatenate(ys)

    return xs,ys

In [26]:
xb,yb = balanced_subsample(X,y);

In [30]:
print(xb.shape)
print(yb.shape)

print(X.shape)
print(y.shape)

(2000, 12100)
(2000,)
(7000, 12100)
(7000,)


In [31]:
classifier2 = SVC(kernel = 'linear' , random_state = 0)
classifier2.fit(xb,yb)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
predictions2 = classifier2.predict(X_test)
print(classification_report(y_test,predictions2))
print("Score: {}".format(accuracy_score(y_test, predictions2)))

             precision    recall  f1-score   support

          0       0.96      0.85      0.91      2300
          1       0.52      0.83      0.64       434

avg / total       0.89      0.85      0.86      2734

Score: 0.8507681053401609


In [33]:
classifier3 = SVC(kernel = 'linear')
classifier3.fit(xb,yb)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [34]:
predictions3 = classifier3.predict(X_test)
print(classification_report(y_test,predictions3))
print("Score: {}".format(accuracy_score(y_test, predictions3)))

             precision    recall  f1-score   support

          0       0.96      0.85      0.91      2300
          1       0.52      0.83      0.64       434

avg / total       0.89      0.85      0.86      2734

Score: 0.8507681053401609


In [36]:
classifier4 = SVC(kernel = 'linear',C = 0.1)
classifier4.fit(xb,yb)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
predictions4 = classifier4.predict(X_test)
print(classification_report(y_test,predictions4))
print("Score: {}".format(accuracy_score(y_test, predictions4)))

             precision    recall  f1-score   support

          0       0.96      0.85      0.91      2300
          1       0.52      0.83      0.64       434

avg / total       0.89      0.85      0.86      2734

Score: 0.8507681053401609


In [39]:
classifier5 = SVC(kernel = 'rbf')
classifier5.fit(xb,yb)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
predictions5 = classifier5.predict(X_test)
print(classification_report(y_test,predictions5))
print("Score: {}".format(accuracy_score(y_test, predictions5)))

             precision    recall  f1-score   support

          0       0.96      0.92      0.94      2300
          1       0.66      0.79      0.72       434

avg / total       0.91      0.90      0.91      2734

Score: 0.9019751280175567


In [41]:
classifier6 = SVC(kernel = 'poly')
classifier6.fit(xb,yb)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [42]:
predictions6 = classifier6.predict(X_test)
print(classification_report(y_test,predictions6))
print("Score: {}".format(accuracy_score(y_test, predictions6)))

             precision    recall  f1-score   support

          0       0.92      0.88      0.90      2300
          1       0.48      0.59      0.53       434

avg / total       0.85      0.84      0.84      2734

Score: 0.8354059985369422


In [43]:
classifier7 = SVC(kernel = 'sigmoid')
classifier7.fit(xb,yb)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
predictions7 = classifier7.predict(X_test)
print(classification_report(y_test,predictions7))
print("Score: {}".format(accuracy_score(y_test, predictions7)))

             precision    recall  f1-score   support

          0       0.84      0.51      0.63      2300
          1       0.16      0.49      0.24       434

avg / total       0.73      0.51      0.57      2734

Score: 0.5058522311631309


In [46]:
classifier8 = SVC(kernel = "rbf")
classifier8.fit(X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
predictions8 = classifier8.predict(X_test)
print(classification_report(y_test,predictions8))
print("Score: {}".format(accuracy_score(y_test, predictions8)))

             precision    recall  f1-score   support

          0       0.90      1.00      0.95      2300
          1       0.97      0.42      0.59       434

avg / total       0.91      0.91      0.89      2734

Score: 0.9059985369422092


In [48]:
print(X.shape)
print(y.shape)

(7000, 12100)
(7000,)
