In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('trainbig.csv')
test = pd.read_csv('testbig.csv')

In [3]:
train['source'] = 'train'
test['source'] = 'test'

In [4]:
data = pd.concat([train,test],ignore_index=True)

In [5]:
print(test.shape , train.shape , data.shape)

(5681, 12) (8523, 13) (14204, 13)


In [6]:
data.apply(lambda x: sum(x.isnull()))


Item_Fat_Content                0
Item_Identifier                 0
Item_MRP                        0
Item_Outlet_Sales            5681
Item_Type                       0
Item_Visibility                 0
Item_Weight                  2439
Outlet_Establishment_Year       0
Outlet_Identifier               0
Outlet_Location_Type            0
Outlet_Size                  4016
Outlet_Type                     0
source                          0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,Item_MRP,Item_Outlet_Sales,Item_Visibility,Item_Weight,Outlet_Establishment_Year
count,14204.0,8523.0,14204.0,11765.0,14204.0
mean,141.004977,2181.288914,0.065953,12.792854,1997.830681
std,62.086938,1706.499616,0.051459,4.652502,8.371664
min,31.29,33.29,0.0,4.555,1985.0
25%,94.012,834.2474,0.027036,8.71,1987.0
50%,142.247,1794.331,0.054021,12.6,1999.0
75%,185.8556,3101.2964,0.094037,16.75,2004.0
max,266.8884,13086.9648,0.328391,21.35,2009.0


In [8]:
data.apply(lambda x: len(x.unique()))

Item_Fat_Content                 5
Item_Identifier               1559
Item_MRP                      8052
Item_Outlet_Sales             3494
Item_Type                       16
Item_Visibility              13006
Item_Weight                    416
Outlet_Establishment_Year        9
Outlet_Identifier               10
Outlet_Location_Type             3
Outlet_Size                      4
Outlet_Type                      4
source                           2
dtype: int64

In [9]:
categorical_col = [x for x in data.columns if data.dtypes[x]=='object']

In [10]:
categorical_col

['Item_Fat_Content',
 'Item_Identifier',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Location_Type',
 'Outlet_Size',
 'Outlet_Type',
 'source']

In [11]:
data['Item_Weight'] = data.groupby(by = 'Item_Identifier')['Item_Weight'].transform(lambda x: x.fillna(x.mean()))

In [12]:
data['Outlet_Size'] = data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0])


In [13]:
data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x: x[0:2])

In [14]:
data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food',
                                                             'NC':'Non-Consumable',
                                                             'DR':'Drinks'})

In [15]:
data['Outlet_Years'] = 2013 - data['Outlet_Establishment_Year']

In [16]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({'LF':'Low Fat',
                                                             'reg':'Regular',
                                                             'low fat':'Low Fat'})

In [17]:
data.loc[data['Item_Type_Combined']=="Non-Consumable",'Item_Fat_Content'] = "Non-Edible"

In [18]:
mean_visibility = data['Item_Visibility'].mean()
mean_visibility

0.06595278007399345

In [19]:
data.loc[data['Item_Visibility']==0,'Item_Visibility'] = mean_visibility

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
le = LabelEncoder()


In [22]:
data['Outlet'] = le.fit_transform(data['Outlet_Identifier'])
data['Item'] = le.fit_transform(data['Item_Identifier'])

In [23]:
var_mod = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Item_Type_Combined','Outlet_Type','Outlet']
le = LabelEncoder()
for i in var_mod:
    data[i] = le.fit_transform(data[i])

In [24]:
data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)

In [25]:
train = data.loc[data['source']=="train"]
test = data.loc[data['source']=="test"]

In [26]:
test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [27]:
train.to_csv("train_modified.csv",index=False)
test.to_csv("test_modified.csv",index=False)

In [28]:
train.head()

Unnamed: 0,Item_Fat_Content,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Visibility,Item_Weight,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type,Item_Type_Combined,Outlet_Years,Outlet,Item
0,0,FDA15,249.8092,3735.138,0.016047,9.3,OUT049,0,1,1,1,14,9,156
1,2,DRC01,48.2692,443.4228,0.019278,5.92,OUT018,2,1,2,0,4,3,8
2,0,FDN15,141.618,2097.27,0.01676,17.5,OUT049,0,1,1,1,14,9,662
3,2,FDX07,182.095,732.38,0.065953,19.2,OUT010,2,1,0,1,15,0,1121
4,1,NCD19,53.8614,994.7052,0.065953,8.93,OUT013,2,0,1,2,26,1,1297


In [29]:
features = ['Item_Fat_Content','Item','Item_MRP','Item_Visibility','Item_Weight','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet_Years','Outlet']

In [30]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
import numpy as np

In [31]:
def prediction_function(train, test):
    estimators = [50, 75, 80, 100, 125, 150, 200, 250, 500]
    final = []
    
    for e in estimators:
        rf = RandomForestRegressor(random_state = 1, n_estimators = e, min_samples_split = 8, min_samples_leaf = 4)
        gbr = GradientBoostingRegressor(random_state = 1, n_estimators = e, min_samples_split = 8, 
                                        min_samples_leaf = 4, learning_rate = 0.1)
        rdg = Ridge(alpha = 1,random_state = 1)

        rf.fit(train[features], train['Item_Outlet_Sales'])
        predictions_rf = rf.predict(train[features])
        predictions_rf = predictions_rf.astype(int)

        gbr.fit(train[features], train['Item_Outlet_Sales'])
        predictions_gbr = gbr.predict(train[features])
        predictions_gbr = predictions_gbr.astype(int)
        
        rdg.fit(train[features], train['Item_Outlet_Sales'])
        predictions_rdg = rdg.predict(train[features])
        predictions_rdg = predictions_rdg.astype(int)

        

        mse_rf = (np.sqrt(mean_squared_error(train['Item_Outlet_Sales'], predictions_rf)), 'RF')
        mse_gbr = (np.sqrt(mean_squared_error(train['Item_Outlet_Sales'], predictions_gbr)), 'GBR')
        mse_rdg = (np.sqrt(mean_squared_error(train['Item_Outlet_Sales'],predictions_rdg)),'RDG')
        
        
        print(mse_rf , mse_gbr, mse_rdg)
        error_min = min(mse_rf, min(mse_gbr,mse_rdg))
#         print(error_min)
        final.append((error_min, e))
    
    print(final)
    min_final = min(final)
    print("Minimum MSE, regressor to use and number of estimators: "+str(min_final))
    return list(min_final)

In [32]:
min_final = prediction_function(train, test)

(730.5427595716732, 'RF') (1056.4742730366704, 'GBR') (1194.0450694891595, 'RDG')
(727.860676858012, 'RF') (1046.068572888679, 'GBR') (1194.0450694891595, 'RDG')
(727.1622961277768, 'RF') (1044.535939356445, 'GBR') (1194.0450694891595, 'RDG')
(726.5497512765031, 'RF') (1038.6536674732015, 'GBR') (1194.0450694891595, 'RDG')
(725.5134146698408, 'RF') (1030.1002203146732, 'GBR') (1194.0450694891595, 'RDG')
(724.3619294067104, 'RF') (1022.237671716092, 'GBR') (1194.0450694891595, 'RDG')
(723.6756255445839, 'RF') (1004.9378961266949, 'GBR') (1194.0450694891595, 'RDG')
(723.3723606171947, 'RF') (991.8725274160408, 'GBR') (1194.0450694891595, 'RDG')
(722.1678552910503, 'RF') (937.3278243920021, 'GBR') (1194.0450694891595, 'RDG')
[((730.5427595716732, 'RF'), 50), ((727.860676858012, 'RF'), 75), ((727.1622961277768, 'RF'), 80), ((726.5497512765031, 'RF'), 100), ((725.5134146698408, 'RF'), 125), ((724.3619294067104, 'RF'), 150), ((723.6756255445839, 'RF'), 200), ((723.3723606171947, 'RF'), 250),

In [33]:
e = min_final[1]
regressor = min_final[0][1]

In [34]:
if(regressor == 'RF'):
    reg = RandomForestRegressor(random_state = 1, n_estimators = e, min_samples_split = 8, min_samples_leaf = 4)
elif(regressor == 'GBR'):
    reg = GradientBoostingRegressor(random_state = 1, n_estimators = e, min_samples_split = 8, min_samples_leaf = 4, learning_rate = 0.1)
else:
    rdg = Ridge(alpha = 1,random_state = 1)

reg.fit(train[features],train['Item_Outlet_Sales'])
predictions = reg.predict(test[features])
predictions = predictions.astype(int)

In [35]:
result = pd.DataFrame({'Item_Identifier' : test['Item_Identifier'], 'Outlet_Identifier' : test['Outlet_Identifier'], 'Item_Outlet_Sales' : predictions})

In [37]:
result.to_csv('Result.csv')