In [1]:
from IPython.display import Image
import graphviz 
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from sklearn.datasets import load_iris, load_wine, load_boston
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the Forbes Top-2000 (2017) CSV and display its (rows, columns) shape.
forbes_csv_path = 'forbes-top-2000-companies/Forbes Top2000 2017.csv'
origin_data = pd.read_csv(forbes_csv_path)
origin_data.shape

(2000, 10)

In [3]:
# Preview the first rows to see what each column contains.
origin_data.head()

Unnamed: 0.1,Unnamed: 0,Rank,Company,Country,Sales,Profits,Assets,Market Value,Sector,Industry
0,,1,ICBC,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
1,,2,China Construction Bank,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
2,,3,Berkshire Hathaway,United States,222.9,24.1,620.9,409.9,Financials,Investment Services
3,,4,JPMorgan Chase,United States,102.5,24.2,2513.0,306.6,Financials,Major Banks
4,,5,Wells Fargo,United States,97.6,21.9,1943.4,274.4,Financials,Major Banks


In [4]:
# Check column dtypes to tell the numeric columns from the text (object) ones.
origin_data.dtypes

Unnamed: 0      float64
 Rank             int64
Company          object
Country          object
Sales           float64
Profits         float64
Assets          float64
Market Value    float64
Sector           object
Industry         object
dtype: object

In [5]:
# Count missing values per column.
origin_data.isnull().sum()

Unnamed: 0      2000
 Rank              0
Company            0
Country            0
Sales              0
Profits            0
Assets             0
Market Value       0
Sector           197
Industry         491
dtype: int64

In [6]:
# Drop the columns that are not used as features: "Unnamed: 0" (entirely NaN),
# " Rank" (the leading space is part of the column name) and "Company".
# Reassigning instead of inplace=True, together with errors="ignore", makes this
# cell idempotent: re-running it no longer raises KeyError for columns that are
# already gone.
origin_data = origin_data.drop(
    columns=["Unnamed: 0", " Rank", "Company"],
    errors="ignore",
)

In [7]:
# Show the first ten rows to confirm the dropped columns are gone.
origin_data.head(10)

Unnamed: 0,Country,Sales,Profits,Assets,Market Value,Sector,Industry
0,China,151.4,42.0,3473.2,229.8,Financials,Major Banks
1,China,134.2,35.0,3016.6,200.5,Financials,Regional Banks
2,United States,222.9,24.1,620.9,409.9,Financials,Investment Services
3,United States,102.5,24.2,2513.0,306.6,Financials,Major Banks
4,United States,97.6,21.9,1943.4,274.4,Financials,Major Banks
5,China,115.7,27.8,2816.0,149.2,Financials,Regional Banks
6,United States,92.2,16.6,2196.8,231.9,Financials,Major Banks
7,China,113.1,24.9,2611.5,141.3,Financials,Major Banks
8,United States,217.5,45.2,331.1,752.0,Information Technology,Computer Hardware
9,Japan,249.9,17.1,412.5,171.9,Consumer Discretionary,Auto & Truck Manufacturers


In [8]:
# Re-count missing values per column after the drop.
origin_data.isnull().sum()

Country           0
Sales             0
Profits           0
Assets            0
Market Value      0
Sector          197
Industry        491
dtype: int64

In [9]:
# Strategy 1: drop every row that has a missing value in any column.
# .copy() yields an independent frame, so later column assignments cannot touch
# origin_data or raise SettingWithCopyWarning. It replaces the former
# `data.is_copy = False` workaround: that attribute was deprecated in pandas
# and no longer exists in current releases.
data = origin_data.dropna(axis=0, how='any').copy()

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


### Если удалить пропуски

In [10]:
from sklearn.preprocessing import LabelEncoder


def encode_columns(origin_data):
    """Label-encode the categorical columns of the Forbes frame.

    ``Country``, ``Sector`` and ``Industry`` are each replaced by the integer
    labels produced by a ``LabelEncoder`` fitted on that column alone.

    Parameters
    ----------
    origin_data : pandas.DataFrame
        Frame that contains the ``Country``, ``Sector`` and ``Industry``
        columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns encoded. The argument itself is
        left unchanged, so the calling cell can be re-run safely.
    """
    # Work on a copy: the previous version aliased the argument, overwrote the
    # caller's columns in place and relied on the removed ``is_copy`` flag.
    data = origin_data.copy()

    for column in ("Country", "Sector", "Industry"):
        # fit_transform refits from scratch, so one encoder per column is
        # equivalent to re-using a single instance and avoids shared state.
        data[column] = LabelEncoder().fit_transform(data[column])
    return data

In [11]:
# Label-encode Country / Sector / Industry of the frame with missing rows dropped.
data = encode_columns(data)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Target is Profits; every remaining column is used as a feature.
y = data['Profits']
data_X = data.drop(columns=['Profits'])

# The same seed is used for the split and for the forest.
X_train, X_test, y_train, y_test = train_test_split(data_X, y, random_state=100)

# fit() returns the estimator itself, so the cell still ends by displaying it.
clf = RandomForestRegressor(random_state=100).fit(X_train, y_train)
clf



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=100, verbose=0, warm_start=False)

In [13]:
from sklearn.metrics import r2_score, mean_squared_error

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

# Root of the mean squared error on the train and test parts.
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
print('root mean_squared_error train / test: {:0.3f} / {:0.3f}'.format(rmse_train, rmse_test))

# Coefficient of determination on the same two parts.
r2_train = r2_score(y_train, pred_train)
r2_test = r2_score(y_test, pred_test)
print('r2_score train / test: {:0.3f} / {:0.3f}'.format(r2_train, r2_test))

root mean_squared_error train / test: 0.756 / 1.484
r2_score train / test: 0.954 / 0.540


### Если импутировать пропуски

In [14]:
# Strategy 2: keep every row and fill the gaps instead of dropping them.
# Start from an independent copy: plain assignment only aliased origin_data, so
# the imputation and encoding steps below would have overwritten it. The
# `is_copy = False` workaround is dropped because that attribute was deprecated
# in pandas and no longer exists in current releases.
data = origin_data.copy()

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


In [15]:
def impute_sector_industry(data):
    """Replace missing ``Sector`` / ``Industry`` values with ``'Other'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``Sector`` and ``Industry`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which missing entries of the two columns are set to
        ``'Other'``; all other columns are carried over unchanged. The
        argument is not modified.
    """
    # A per-column fillna replaces the element-wise list comprehensions and
    # returns a new frame instead of overwriting the caller's columns.
    return data.fillna({'Sector': 'Other', 'Industry': 'Other'})

In [16]:
# Fill the Sector / Industry gaps with 'Other' first, then label-encode the
# categorical columns.
data = impute_sector_industry(data)
data = encode_columns(data)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Same setup as for the dropped-rows variant: Profits is the target, every
# remaining column is a feature.
y = data['Profits']
data_X = data.drop(columns=['Profits'])

# The same seed is used for the split and for the forest.
X_train, X_test, y_train, y_test = train_test_split(data_X, y, random_state=100)

# fit() returns the estimator itself, so the cell still ends by displaying it.
clf = RandomForestRegressor(random_state=100).fit(X_train, y_train)
clf



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=100, verbose=0, warm_start=False)

In [18]:
from sklearn.metrics import r2_score, mean_squared_error

pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

# Root of the mean squared error on the train and test parts.
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
print('root mean_squared_error train / test: {:0.3f} / {:0.3f}'.format(rmse_train, rmse_test))

# Coefficient of determination on the same two parts.
r2_train = r2_score(y_train, pred_train)
r2_test = r2_score(y_test, pred_test)
print('r2_score train / test: {:0.3f} / {:0.3f}'.format(r2_train, r2_test))

root mean_squared_error train / test: 0.893 / 1.451
r2_score train / test: 0.917 / 0.597
