In [1]:
from google.colab import drive

drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
import pandas as pd
from sklearn.svm import SVC
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv('gdrive/My Drive/iProfi AI/Part 2/2/train_texts.csv')
data.head()

Unnamed: 0,id,text,author
0,0,-Бабушка!- вскричала малютка.- Возьми меня с с...,Dostoevsky
1,1,"Знал ли Скрудж об этом? Разумеется, знал. Да и...",Dostoevsky
2,2,"-С праздником, дядя, с радостью! Дай вам Бог в...",Dostoevsky
3,3,Мы высказали только главную передовую мысль на...,Dostoevsky
4,4,"I. Отдел литературный. Повести, романы, расска...",Dostoevsky


In [4]:
test_data = pd.read_csv('gdrive/My Drive/iProfi AI/Part 2/2/test_texts.csv')
test_data.head()

Unnamed: 0,id,text
0,1734,"Идти ему было немного; он даже знал, сколько ш..."
1,1735,"-Твой дедушка был немножко пиратом, а как увер..."
2,1736,"У меня был немецкий паспорт, годный еще на цел..."
3,1737,"-Ну как знаете,- сказал я и сделал вид, будто ..."
4,1738,Я отмахнулся.\n-На шестнадцать и пятьдесят одн...


In [45]:
X = data['text']
y = data['author']

In [46]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [47]:
X

<1734x70276 sparse matrix of type '<class 'numpy.float64'>'
	with 308312 stored elements in Compressed Sparse Row format>

In [48]:
X.shape

(1734, 70276)

In [49]:
names = {0: 'Dostoevsky', 1: 'Akunin', 2: 'Bulychev', 3: 'Chehov', 4: 'Gogol', 5: 'King', 6: 'Pratchett', 7: 'Remark'}

In [50]:
for item in names.items():
  print(item)

(0, 'Dostoevsky')
(1, 'Akunin')
(2, 'Bulychev')
(3, 'Chehov')
(4, 'Gogol')
(5, 'King')
(6, 'Pratchett')
(7, 'Remark')


In [51]:
# from tensorflow.keras.utils import to_categorical

for i in range(len(y)):
  for key, value in names.items():
    if y[i] == value:
      y[i] = int(key)

y

0       0
1       0
2       0
3       0
4       0
       ..
1729    7
1730    7
1731    7
1732    7
1733    7
Name: author, Length: 1734, dtype: object

In [52]:
y = np.array(y, dtype=int)

In [53]:
y

array([0, 0, 0, ..., 7, 7, 7])

In [40]:
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)

In [41]:
model = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(model, grid, scoring='accuracy', cv=cv)

In [42]:
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             estimator=SVC(kernel='linear', random_state=241),
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             scoring='accuracy')

In [54]:
test_X = test_data['text']

test_X = vectorizer.transform(test_X)

test_id = test_data['id']

In [79]:
from xgboost import XGBClassifier

xgboost = XGBClassifier(learning_rate=0.01,n_estimators=1000,
                                     min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='binary:logistic', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

xgboost.fit(X, y)

XGBClassifier(colsample_bytree=0.7, learning_rate=0.01, min_child_weight=0,
              n_estimators=1000, nthread=-1, objective='multi:softprob',
              reg_alpha=6e-05, seed=27, subsample=0.7)

In [80]:
submission = gs.best_estimator_.predict(test_X)

# submission = xgboost.predict(test_X)

In [81]:
authors_df = ['check']*len(submission)

for i in range(len(submission)):
  for key, value in names.items():
    if submission[i] == key:
      authors_df[i] = value

In [82]:
sub_df = pd.Series(authors_df)

In [83]:
sub = pd.concat([test_id, sub_df], axis=1)

In [84]:
sub

Unnamed: 0,id,0
0,1734,Dostoevsky
1,1735,Akunin
2,1736,King
3,1737,Chehov
4,1738,King
...,...,...
325,2059,Bulychev
326,2060,Pratchett
327,2061,Pratchett
328,2062,Pratchett


In [85]:
sub.rename(columns={0: 'author'}, inplace=True)

In [86]:
sub.to_csv('gdrive/My Drive/iProfi AI/Part 2/2/submission.csv', index=False)