In [0]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
from google.colab import files

# files.upload() returns a dict mapping each uploaded filename to its raw
# contents. Leaving the call as the cell's last expression would echo that
# dict -- including the Kaggle API token -- into the saved notebook output,
# so keep the result in a variable and show only the filenames.
uploaded = files.upload()
list(uploaded)

In [0]:
# Create ~/.kaggle (no error if it already exists) and copy the uploaded
# kaggle.json from the working directory into it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [0]:
!pip install kaggle

In [0]:
!chmod 600 /root/.kaggle/kaggle.json

In [0]:
# Download the files of the mercari-price-suggestion-challenge competition
# into the current working directory using the Kaggle CLI.
!kaggle competitions download -c mercari-price-suggestion-challenge

In [0]:
# List the working directory to check what was downloaded.
!ls

In [0]:
!apt-get install p7zip-full

In [0]:
# Decompress (-d) every .7z archive in the working directory.
!p7zip -d *.7z

In [0]:
# Load the tab-separated train and test files into pandas DataFrames.
train, test = (
    pd.read_csv(tsv_path, delimiter='\t')
    for tsv_path in ('train.tsv', 'test.tsv')
)

In [0]:
# Check the (rows, columns) size of train and test.
tuple(frame.shape for frame in (train, test))

In [0]:
# Preview the first five rows of the training data.
train.head(n=5)

In [0]:
# Summary statistics for every column, numeric and non-numeric alike.
train_summary = train.describe(include='all')
train_summary

In [0]:
# Helper that summarises the missing data of a DataFrame as a table.
def missing_values_table(df):
    """Return per-column missing-value counts and percentages for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: 'Missing Values'
        (number of null entries) and '% of Total Values' (that count as a
        percentage of ``len(df)``).
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)
    return pd.concat(
        [null_counts, null_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

# Check whether the training data has any missing values.
train.pipe(missing_values_table)

## Price

## Shipping 

## Category Name

In [0]:
# Helper that splits a "/"-separated category string into its levels.
def split_cat(text):
    """Split a "/"-delimited category string into its parts.

    Parameters
    ----------
    text : str or missing value
        Category path such as "A/B/C". Non-string values (e.g. NaN or None
        for a missing category) are accepted.

    Returns
    -------
    list of str
        The pieces of ``text`` split on "/", when ``text`` supports it.
    tuple of str
        ("No Label", "No Label", "No Label") when ``text`` cannot be split.
    """
    try:
        return text.split("/")
    # Only the failures a non-string input produces are treated as "no
    # category"; a bare except would also swallow KeyboardInterrupt and
    # SystemExit.
    except (AttributeError, TypeError):
        return ("No Label", "No Label", "No Label")

In [0]:
# Split every category_name into its levels, then transpose the per-row
# results into one sequence per level and store each as its own column.
category_levels = train['category_name'].apply(split_cat)
train['general_cat'], train['subcat_1'], train['subcat_2'] = zip(*category_levels)
train.head()

### General Category

### Sub Category 1

### Sub Category 2

## Brand Name

## Item condition id

## 精度の確認

In [0]:
# Keep only the rows that have no missing value in any column
# (dropna defaults: axis=0, how='any').
train = train.dropna()

In [0]:
# Separate the data into x = every column except price, y = price (the target).
y_train_full = train['price']
x_train_full = train.drop(columns=['price'])

In [0]:
# Remove the name, description and unsplit category columns from the features.
dropped_columns = ['name', 'item_description', 'category_name']
x_train_full_dropstr = x_train_full.drop(columns=dropped_columns)
x_train_full_dropstr.head()

In [0]:
# Seeded split of the features and target into train and held-out parts.
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only
# 20% -- confirm this is intended rather than a swapped ratio.
split_parts = train_test_split(
    x_train_full_dropstr,
    y_train_full,
    test_size=0.8,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [0]:
# One-hot encode these categorical columns.
dummy_columns = ['item_condition_id', 'shipping', 'general_cat']
X_train = pd.get_dummies(X_train, columns=dummy_columns)
# Only the first 4000 held-out rows are kept for evaluation.
X_test = pd.get_dummies(X_test[:4000], columns=dummy_columns)

# get_dummies derives its indicator columns from the values present in each
# frame, so the two frames can end up with different columns. Align the
# evaluation frame to the training layout: indicator columns it lacks are
# added filled with 0, and columns the model never saw are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [0]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

X_train['brand_name'] = le.fit_transform(X_train['brand_name'])
X_train['subcat_1'] = le.fit_transform(X_train['subcat_1'])
X_train['subcat_2'] = le.fit_transform(X_train['subcat_2'])

In [0]:
X_test['brand_name'] = le.fit_transform(X_test['brand_name'])
X_test['subcat_1'] = le.fit_transform(X_test['subcat_1'])
X_test['subcat_2'] = le.fit_transform(X_test['subcat_2'])

In [0]:
# Build and fit the model. random_state is fixed (the same seed as the
# train/test split) so the forest, and therefore the scores, are reproducible
# from run to run.
m = RandomForestRegressor(
    n_jobs=-1, min_samples_leaf=5, n_estimators=200, random_state=0,
)
m.fit(X_train, y_train)

In [0]:
# Show the model's score on the data it was trained on.
train_score = m.score(X_train, y_train)
train_score

In [0]:
# Show the model's score on the held-out data. X_test can hold fewer rows
# than y_test (only a slice of it is kept when the features are encoded), and
# score() raises on mismatched lengths, so pick the targets by X_test's index
# to keep features and targets aligned row for row.
m.score(X_test, y_test.loc[X_test.index])

In [0]:
# Summary statistics of the encoded training features.
train_feature_stats = X_train.describe()
train_feature_stats

In [0]:
# Preview the first five rows of the encoded held-out features.
X_test.head(n=5)