diff --git a/ML, Sales Market Prediction.ipynb b/ML, Sales Market Prediction.ipynb new file mode 100644 index 0000000..8fe4bb8 --- /dev/null +++ b/ML, Sales Market Prediction.ipynb @@ -0,0 +1,1920 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Big Mart Sales Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Objective: Identify which properties of a product and of a store affect the product's sales." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "train = pd.read_csv(\"train.csv\")\n", + "test = pd.read_csv(\"test.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(8523, 13) (5681, 12) (14204, 13)\n" + ] + } + ], + "source": [ + "#Combine test and train into one file\n", + "train['source']='train'\n", + "test['source']='test'\n", + "data = pd.concat([train, test],ignore_index=True)\n", + "print(train.shape, test.shape, data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Item_Fat_ContentItem_IdentifierItem_MRPItem_Outlet_SalesItem_TypeItem_VisibilityItem_WeightOutlet_Establishment_YearOutlet_IdentifierOutlet_Location_TypeOutlet_SizeOutlet_Typesource
0Low FatFDA15249.80923735.1380Dairy0.0160479.301999OUT049Tier 1MediumSupermarket Type1train
1RegularDRC0148.2692443.4228Soft Drinks0.0192785.922009OUT018Tier 3MediumSupermarket Type2train
2Low FatFDN15141.61802097.2700Meat0.01676017.501999OUT049Tier 1MediumSupermarket Type1train
3RegularFDX07182.0950732.3800Fruits and Vegetables0.00000019.201998OUT010Tier 3NaNGrocery Storetrain
4Low FatNCD1953.8614994.7052Household0.0000008.931987OUT013Tier 3HighSupermarket Type1train
\n", + "
" + ], + "text/plain": [ + " Item_Fat_Content Item_Identifier Item_MRP Item_Outlet_Sales \\\n", + "0 Low Fat FDA15 249.8092 3735.1380 \n", + "1 Regular DRC01 48.2692 443.4228 \n", + "2 Low Fat FDN15 141.6180 2097.2700 \n", + "3 Regular FDX07 182.0950 732.3800 \n", + "4 Low Fat NCD19 53.8614 994.7052 \n", + "\n", + " Item_Type Item_Visibility Item_Weight \\\n", + "0 Dairy 0.016047 9.30 \n", + "1 Soft Drinks 0.019278 5.92 \n", + "2 Meat 0.016760 17.50 \n", + "3 Fruits and Vegetables 0.000000 19.20 \n", + "4 Household 0.000000 8.93 \n", + "\n", + " Outlet_Establishment_Year Outlet_Identifier Outlet_Location_Type \\\n", + "0 1999 OUT049 Tier 1 \n", + "1 2009 OUT018 Tier 3 \n", + "2 1999 OUT049 Tier 1 \n", + "3 1998 OUT010 Tier 3 \n", + "4 1987 OUT013 Tier 3 \n", + "\n", + " Outlet_Size Outlet_Type source \n", + "0 Medium Supermarket Type1 train \n", + "1 Medium Supermarket Type2 train \n", + "2 Medium Supermarket Type1 train \n", + "3 NaN Grocery Store train \n", + "4 High Supermarket Type1 train " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Item_MRPItem_Outlet_SalesItem_VisibilityItem_WeightOutlet_Establishment_Year
count14204.0000008523.00000014204.00000011765.00000014204.000000
mean141.0049772181.2889140.06595312.7928541997.830681
std62.0869381706.4996160.0514594.6525028.371664
min31.29000033.2900000.0000004.5550001985.000000
25%94.012000834.2474000.0270368.7100001987.000000
50%142.2470001794.3310000.05402112.6000001999.000000
75%185.8556003101.2964000.09403716.7500002004.000000
max266.88840013086.9648000.32839121.3500002009.000000
\n", + "
" + ], + "text/plain": [ + " Item_MRP Item_Outlet_Sales Item_Visibility Item_Weight \\\n", + "count 14204.000000 8523.000000 14204.000000 11765.000000 \n", + "mean 141.004977 2181.288914 0.065953 12.792854 \n", + "std 62.086938 1706.499616 0.051459 4.652502 \n", + "min 31.290000 33.290000 0.000000 4.555000 \n", + "25% 94.012000 834.247400 0.027036 8.710000 \n", + "50% 142.247000 1794.331000 0.054021 12.600000 \n", + "75% 185.855600 3101.296400 0.094037 16.750000 \n", + "max 266.888400 13086.964800 0.328391 21.350000 \n", + "\n", + " Outlet_Establishment_Year \n", + "count 14204.000000 \n", + "mean 1997.830681 \n", + "std 8.371664 \n", + "min 1985.000000 \n", + "25% 1987.000000 \n", + "50% 1999.000000 \n", + "75% 2004.000000 \n", + "max 2009.000000 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Numerical data summary:\n", + "data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Item_Fat_Content 0\n", + "Item_Identifier 0\n", + "Item_MRP 0\n", + "Item_Outlet_Sales 5681\n", + "Item_Type 0\n", + "Item_Visibility 0\n", + "Item_Weight 2439\n", + "Outlet_Establishment_Year 0\n", + "Outlet_Identifier 0\n", + "Outlet_Location_Type 0\n", + "Outlet_Size 4016\n", + "Outlet_Type 0\n", + "source 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Check missing values:\n", + "data.apply(lambda x: sum(x.isnull()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filling missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "data.Item_Outlet_Sales = data.Item_Outlet_Sales.fillna(data.Item_Outlet_Sales.mean())" + ] + }, + { + "cell_type": "code", + 
"execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "data.Item_Weight = data.Item_Weight.fillna(data.Item_Weight.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Medium 4655\n", + "Small 3980\n", + "High 1553\n", + "Name: Outlet_Size, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['Outlet_Size'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "data.Outlet_Size = data.Outlet_Size.fillna('Medium')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Item_Fat_Content 0\n", + "Item_Identifier 0\n", + "Item_MRP 0\n", + "Item_Outlet_Sales 0\n", + "Item_Type 0\n", + "Item_Visibility 0\n", + "Item_Weight 0\n", + "Outlet_Establishment_Year 0\n", + "Outlet_Identifier 0\n", + "Outlet_Location_Type 0\n", + "Outlet_Size 0\n", + "Outlet_Type 0\n", + "source 0\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.apply(lambda x: sum(x.isnull()))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 14204 entries, 0 to 14203\n", + "Data columns (total 13 columns):\n", + "Item_Fat_Content 14204 non-null object\n", + "Item_Identifier 14204 non-null object\n", + "Item_MRP 14204 non-null float64\n", + "Item_Outlet_Sales 14204 non-null float64\n", + "Item_Type 14204 non-null object\n", + "Item_Visibility 14204 non-null float64\n", + "Item_Weight 14204 non-null float64\n", + "Outlet_Establishment_Year 14204 non-null int64\n", + "Outlet_Identifier 14204 non-null object\n", + "Outlet_Location_Type 14204 non-null object\n", + 
"Outlet_Size 14204 non-null object\n", + "Outlet_Type 14204 non-null object\n", + "source 14204 non-null object\n", + "dtypes: float64(4), int64(1), object(8)\n", + "memory usage: 1.4+ MB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Food 10201\n", + "Non-Consumable 2686\n", + "Drinks 1317\n", + "Name: Item_Type_Combined, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Item type combine:\n", + "data['Item_Identifier'].value_counts()\n", + "data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x: x[0:2])\n", + "data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food',\n", + " 'NC':'Non-Consumable',\n", + " 'DR':'Drinks'})\n", + "data['Item_Type_Combined'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Numerical and One-Hot Coding of Categorical variables" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "#Import library:\n", + "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", + "le = LabelEncoder()\n", + "#New variable for outlet\n", + "data['Outlet'] = le.fit_transform(data['Outlet_Identifier'])\n", + "var_mod = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Item_Type_Combined','Outlet_Type','Outlet']\n", + "le = LabelEncoder()\n", + "for i in var_mod:\n", + " data[i] = le.fit_transform(data[i])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "#One Hot Coding:\n", + "data = pd.get_dummies(data, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet'])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Item_IdentifierItem_MRPItem_Outlet_SalesItem_TypeItem_VisibilityItem_WeightOutlet_Establishment_YearOutlet_IdentifiersourceItem_Fat_Content_0...Outlet_0Outlet_1Outlet_2Outlet_3Outlet_4Outlet_5Outlet_6Outlet_7Outlet_8Outlet_9
0FDA15249.80923735.1380Dairy0.0160479.301999OUT049train0...0000000001
1DRC0148.2692443.4228Soft Drinks0.0192785.922009OUT018train0...0001000000
2FDN15141.61802097.2700Meat0.01676017.501999OUT049train0...0000000001
3FDX07182.0950732.3800Fruits and Vegetables0.00000019.201998OUT010train0...1000000000
4NCD1953.8614994.7052Household0.0000008.931987OUT013train0...0100000000
\n", + "

5 rows × 37 columns

\n", + "
" + ], + "text/plain": [ + " Item_Identifier Item_MRP Item_Outlet_Sales Item_Type \\\n", + "0 FDA15 249.8092 3735.1380 Dairy \n", + "1 DRC01 48.2692 443.4228 Soft Drinks \n", + "2 FDN15 141.6180 2097.2700 Meat \n", + "3 FDX07 182.0950 732.3800 Fruits and Vegetables \n", + "4 NCD19 53.8614 994.7052 Household \n", + "\n", + " Item_Visibility Item_Weight Outlet_Establishment_Year Outlet_Identifier \\\n", + "0 0.016047 9.30 1999 OUT049 \n", + "1 0.019278 5.92 2009 OUT018 \n", + "2 0.016760 17.50 1999 OUT049 \n", + "3 0.000000 19.20 1998 OUT010 \n", + "4 0.000000 8.93 1987 OUT013 \n", + "\n", + " source Item_Fat_Content_0 ... Outlet_0 Outlet_1 Outlet_2 \\\n", + "0 train 0 ... 0 0 0 \n", + "1 train 0 ... 0 0 0 \n", + "2 train 0 ... 0 0 0 \n", + "3 train 0 ... 1 0 0 \n", + "4 train 0 ... 0 1 0 \n", + "\n", + " Outlet_3 Outlet_4 Outlet_5 Outlet_6 Outlet_7 Outlet_8 Outlet_9 \n", + "0 0 0 0 0 0 0 1 \n", + "1 1 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 1 \n", + "3 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 \n", + "\n", + "[5 rows x 37 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Item_Identifier object\n", + "Item_MRP float64\n", + "Item_Outlet_Sales float64\n", + "Item_Type object\n", + "Item_Visibility float64\n", + "Item_Weight float64\n", + "Outlet_Establishment_Year int64\n", + "Outlet_Identifier object\n", + "source object\n", + "Item_Fat_Content_0 uint8\n", + "Item_Fat_Content_1 uint8\n", + "Item_Fat_Content_2 uint8\n", + "Item_Fat_Content_3 uint8\n", + "Item_Fat_Content_4 uint8\n", + "Outlet_Location_Type_0 uint8\n", + "Outlet_Location_Type_1 uint8\n", + "Outlet_Location_Type_2 uint8\n", + "Outlet_Size_0 uint8\n", + "Outlet_Size_1 uint8\n", + "Outlet_Size_2 uint8\n", + "Outlet_Type_0 uint8\n", + "Outlet_Type_1 uint8\n", + "Outlet_Type_2 uint8\n", + "Outlet_Type_3 
uint8\n", + "Item_Type_Combined_0 uint8\n", + "Item_Type_Combined_1 uint8\n", + "Item_Type_Combined_2 uint8\n", + "Outlet_0 uint8\n", + "Outlet_1 uint8\n", + "Outlet_2 uint8\n", + "Outlet_3 uint8\n", + "Outlet_4 uint8\n", + "Outlet_5 uint8\n", + "Outlet_6 uint8\n", + "Outlet_7 uint8\n", + "Outlet_8 uint8\n", + "Outlet_9 uint8\n", + "dtype: object" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exporting Data" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "#Drop columns the models below do not use (into a new frame, so `data` stays intact and this cell can be re-run):\n", + "export_data = data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1)\n", + "\n", + "#Divide into test and train:\n", + "train = export_data.loc[export_data['source']==\"train\"]\n", + "test = export_data.loc[export_data['source']==\"test\"]\n", + "\n", + "#Drop unnecessary columns (reassigned, not in place, so no SettingWithCopyWarning is raised):\n", + "test = test.drop(['Item_Outlet_Sales','source'],axis=1)\n", + "train = train.drop(['source'],axis=1)\n", + "\n", + "#Export files as modified versions:\n", + "train.to_csv(\"train_modified.csv\",index=False)\n", + "test.to_csv(\"test_modified.csv\",index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Building" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Reading modified data\n", + "train2 = pd.read_csv(\"train_modified.csv\")\n", + "test2 = pd.read_csv(\"test_modified.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Item_IdentifierItem_MRPItem_Outlet_SalesItem_VisibilityItem_WeightOutlet_IdentifierItem_Fat_Content_0Item_Fat_Content_1Item_Fat_Content_2Item_Fat_Content_3...Outlet_0Outlet_1Outlet_2Outlet_3Outlet_4Outlet_5Outlet_6Outlet_7Outlet_8Outlet_9
0FDA15249.80923735.13800.0160479.30OUT0490100...0000000001
1DRC0148.2692443.42280.0192785.92OUT0180010...0001000000
2FDN15141.61802097.27000.01676017.50OUT0490100...0000000001
3FDX07182.0950732.38000.00000019.20OUT0100010...1000000000
4NCD1953.8614994.70520.0000008.93OUT0130100...0100000000
\n", + "

5 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + " Item_Identifier Item_MRP Item_Outlet_Sales Item_Visibility Item_Weight \\\n", + "0 FDA15 249.8092 3735.1380 0.016047 9.30 \n", + "1 DRC01 48.2692 443.4228 0.019278 5.92 \n", + "2 FDN15 141.6180 2097.2700 0.016760 17.50 \n", + "3 FDX07 182.0950 732.3800 0.000000 19.20 \n", + "4 NCD19 53.8614 994.7052 0.000000 8.93 \n", + "\n", + " Outlet_Identifier Item_Fat_Content_0 Item_Fat_Content_1 \\\n", + "0 OUT049 0 1 \n", + "1 OUT018 0 0 \n", + "2 OUT049 0 1 \n", + "3 OUT010 0 0 \n", + "4 OUT013 0 1 \n", + "\n", + " Item_Fat_Content_2 Item_Fat_Content_3 ... Outlet_0 Outlet_1 \\\n", + "0 0 0 ... 0 0 \n", + "1 1 0 ... 0 0 \n", + "2 0 0 ... 0 0 \n", + "3 1 0 ... 1 0 \n", + "4 0 0 ... 0 1 \n", + "\n", + " Outlet_2 Outlet_3 Outlet_4 Outlet_5 Outlet_6 Outlet_7 Outlet_8 \\\n", + "0 0 0 0 0 0 0 0 \n", + "1 0 1 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 \n", + "\n", + " Outlet_9 \n", + "0 1 \n", + "1 0 \n", + "2 1 \n", + "3 0 \n", + "4 0 \n", + "\n", + "[5 rows x 34 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = train2.drop(['Item_Outlet_Sales', 'Outlet_Identifier','Item_Identifier'], axis=1)\n", + "y_train = train2.Item_Outlet_Sales" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "X_test = test2.drop(['Outlet_Identifier','Item_Identifier'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Item_MRPItem_VisibilityItem_WeightItem_Fat_Content_0Item_Fat_Content_1Item_Fat_Content_2Item_Fat_Content_3Item_Fat_Content_4Outlet_Location_Type_0Outlet_Location_Type_1...Outlet_0Outlet_1Outlet_2Outlet_3Outlet_4Outlet_5Outlet_6Outlet_7Outlet_8Outlet_9
0249.80920.0160479.300100010...0000000001
148.26920.0192785.920010000...0001000000
2141.61800.01676017.500100010...0000000001
3182.09500.00000019.200010000...1000000000
453.86140.0000008.930100000...0100000000
\n", + "

5 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " Item_MRP Item_Visibility Item_Weight Item_Fat_Content_0 \\\n", + "0 249.8092 0.016047 9.30 0 \n", + "1 48.2692 0.019278 5.92 0 \n", + "2 141.6180 0.016760 17.50 0 \n", + "3 182.0950 0.000000 19.20 0 \n", + "4 53.8614 0.000000 8.93 0 \n", + "\n", + " Item_Fat_Content_1 Item_Fat_Content_2 Item_Fat_Content_3 \\\n", + "0 1 0 0 \n", + "1 0 1 0 \n", + "2 1 0 0 \n", + "3 0 1 0 \n", + "4 1 0 0 \n", + "\n", + " Item_Fat_Content_4 Outlet_Location_Type_0 Outlet_Location_Type_1 \\\n", + "0 0 1 0 \n", + "1 0 0 0 \n", + "2 0 1 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + " ... Outlet_0 Outlet_1 Outlet_2 Outlet_3 Outlet_4 Outlet_5 \\\n", + "0 ... 0 0 0 0 0 0 \n", + "1 ... 0 0 0 1 0 0 \n", + "2 ... 0 0 0 0 0 0 \n", + "3 ... 1 0 0 0 0 0 \n", + "4 ... 0 1 0 0 0 0 \n", + "\n", + " Outlet_6 Outlet_7 Outlet_8 Outlet_9 \n", + "0 0 0 0 1 \n", + "1 0 0 0 0 \n", + "2 0 0 0 1 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3735.1380\n", + "1 443.4228\n", + "2 2097.2700\n", + "3 732.3800\n", + "4 994.7052\n", + "Name: Item_Outlet_Sales, dtype: float64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear Regression Model:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fitting Multiple Linear Regression to the training set\n", + "from 
sklearn.linear_model import LinearRegression\n", + "regressor = LinearRegression()\n", + "regressor.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# Predicting the test set results\n", + "y_pred = regressor.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1848.53604783, 1472.81670435, 1875.65285894, ..., 1809.18796433,\n", + " 3565.6645235 , 1267.46171871])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "# Measuring Accuracy\n", + "from sklearn.metrics import accuracy_score, r2_score, mean_squared_error\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn import metrics  # sklearn.cross_validation (removed in scikit-learn 0.20) was imported here but never used\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "56.36" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr_accuracy = round(regressor.score(X_train,y_train) * 100,2)\n", + "lr_accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.563589277727048" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_train, regressor.predict(X_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "#Perform cross-validation (sklearn scorers are higher-is-better, so MSE comes back negated; np.abs in the next cell undoes that):\n", + "cv_score = cross_val_score(regressor, X_train, y_train, cv=5, 
scoring='neg_mean_squared_error')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1150.93927648 1118.68414103 1112.89657923 1126.30724065 1140.59735737]\n" + ] + } + ], + "source": [ + "print(np.sqrt(np.abs(cv_score)))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE : 1127\n" + ] + } + ], + "source": [ + "print(\"RMSE : %.4g\" % np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "submission = pd.DataFrame({\n", + "'Item_Identifier':test2['Item_Identifier'],\n", + "'Outlet_Identifier':test2['Outlet_Identifier'],\n", + "'Item_Outlet_Sales': y_pred\n", + "},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "submission.to_csv('submission1.csv',index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Decision Tree Model:" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeRegressor(criterion='mse', max_depth=15, max_features=None,\n", + " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", + " min_impurity_split=None, min_samples_leaf=300,\n", + " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", + " presort=False, random_state=None, splitter='best')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fitting Decision Tree Regression to the dataset\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "regressor = DecisionTreeRegressor(max_depth=15,min_samples_leaf=300)\n", +
"regressor.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1673.98398729, 1349.51290433, 471.30684669, ..., 1892.06614452,\n", + " 3805.94860417, 1349.51290433])" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predicting the test set results\n", + "y_pred = regressor.predict(X_test)\n", + "y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.59" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tree_accuracy = round(regressor.score(X_train,y_train),2)  # training-set R^2 as a fraction (lr_accuracy above is a percentage)\n", + "tree_accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5884050821570486" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_train, regressor.predict(X_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1138.77137157 1109.42501179 1145.66395939 1113.2648073 1129.0816826 ]\n" + ] + } + ], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')  # negated MSE; np.abs below restores the sign\n", + "print(np.sqrt(np.abs(cv_score)))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE : 1095\n" + ] + } + ], + "source": [ + "print(\"RMSE : %.4g\" % np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ +
"submission = pd.DataFrame({\n", + "'Item_Identifier':test2['Item_Identifier'],\n", + "'Outlet_Identifier':test2['Outlet_Identifier'],\n", + "'Item_Outlet_Sales': y_pred\n", + "},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "submission.to_csv('submission2.csv',index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random Forest Model:" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,\n", + " max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=50, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,\n", + " oob_score=False, random_state=None, verbose=0, warm_start=False)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fitting Random Forest Regression to the dataset\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "regressor = RandomForestRegressor(n_estimators=100,max_depth=6, min_samples_leaf=50,n_jobs=4)  # no random_state: results differ slightly between runs\n", + "regressor.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1643.87106725, 1364.24193091, 603.09113992, ..., 1957.62183676,\n", + " 3698.60040819, 1290.25320329])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predicting the test set results\n", + "y_pred = regressor.predict(X_test)\n", + "y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 47, +
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf_accuracy = round(regressor.score(X_train,y_train),2)  # training-set R^2 as a fraction (lr_accuracy above is a percentage)\n", + "rf_accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6125814698282157" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r2_score(y_train, regressor.predict(X_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1100.46298396 1077.70836131 1077.65325884 1069.0502564 1083.85364282]\n" + ] + } + ], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')  # negated MSE; np.abs below restores the sign\n", + "print(np.sqrt(np.abs(cv_score)))" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE : 1062\n" + ] + } + ], + "source": [ + "print(\"RMSE : %.4g\" % np.sqrt(metrics.mean_squared_error(y_train, regressor.predict(X_train))))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "submission = pd.DataFrame({\n", + "'Item_Identifier':test2['Item_Identifier'],\n", + "'Outlet_Identifier':test2['Outlet_Identifier'],\n", + "'Item_Outlet_Sales': y_pred\n", + "},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "submission.to_csv('submission3.csv',index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", +
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}