diff --git a/ML, Sales Market Prediction.ipynb b/ML, Sales Market Prediction.ipynb
new file mode 100644
index 0000000..8fe4bb8
--- /dev/null
+++ b/ML, Sales Market Prediction.ipynb
@@ -0,0 +1,1920 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Big Mart Sales Dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Objective: Identify which properties of a product and of a store affect the product's sales."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the raw training and test splits from the working directory.\n",
+ "train, test = (pd.read_csv(csv_name) for csv_name in (\"train.csv\", \"test.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(8523, 13) (5681, 12) (14204, 13)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Tag every row with the split it came from, then stack both frames into a\n",
+ "# single table (fresh 0..n-1 index) so cleaning is applied to both at once.\n",
+ "train = train.assign(source='train')\n",
+ "test = test.assign(source='test')\n",
+ "data = pd.concat([train, test], ignore_index=True)\n",
+ "print(train.shape, test.shape, data.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Item_Fat_Content | \n",
+ " Item_Identifier | \n",
+ " Item_MRP | \n",
+ " Item_Outlet_Sales | \n",
+ " Item_Type | \n",
+ " Item_Visibility | \n",
+ " Item_Weight | \n",
+ " Outlet_Establishment_Year | \n",
+ " Outlet_Identifier | \n",
+ " Outlet_Location_Type | \n",
+ " Outlet_Size | \n",
+ " Outlet_Type | \n",
+ " source | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Low Fat | \n",
+ " FDA15 | \n",
+ " 249.8092 | \n",
+ " 3735.1380 | \n",
+ " Dairy | \n",
+ " 0.016047 | \n",
+ " 9.30 | \n",
+ " 1999 | \n",
+ " OUT049 | \n",
+ " Tier 1 | \n",
+ " Medium | \n",
+ " Supermarket Type1 | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Regular | \n",
+ " DRC01 | \n",
+ " 48.2692 | \n",
+ " 443.4228 | \n",
+ " Soft Drinks | \n",
+ " 0.019278 | \n",
+ " 5.92 | \n",
+ " 2009 | \n",
+ " OUT018 | \n",
+ " Tier 3 | \n",
+ " Medium | \n",
+ " Supermarket Type2 | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Low Fat | \n",
+ " FDN15 | \n",
+ " 141.6180 | \n",
+ " 2097.2700 | \n",
+ " Meat | \n",
+ " 0.016760 | \n",
+ " 17.50 | \n",
+ " 1999 | \n",
+ " OUT049 | \n",
+ " Tier 1 | \n",
+ " Medium | \n",
+ " Supermarket Type1 | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Regular | \n",
+ " FDX07 | \n",
+ " 182.0950 | \n",
+ " 732.3800 | \n",
+ " Fruits and Vegetables | \n",
+ " 0.000000 | \n",
+ " 19.20 | \n",
+ " 1998 | \n",
+ " OUT010 | \n",
+ " Tier 3 | \n",
+ " NaN | \n",
+ " Grocery Store | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Low Fat | \n",
+ " NCD19 | \n",
+ " 53.8614 | \n",
+ " 994.7052 | \n",
+ " Household | \n",
+ " 0.000000 | \n",
+ " 8.93 | \n",
+ " 1987 | \n",
+ " OUT013 | \n",
+ " Tier 3 | \n",
+ " High | \n",
+ " Supermarket Type1 | \n",
+ " train | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Item_Fat_Content Item_Identifier Item_MRP Item_Outlet_Sales \\\n",
+ "0 Low Fat FDA15 249.8092 3735.1380 \n",
+ "1 Regular DRC01 48.2692 443.4228 \n",
+ "2 Low Fat FDN15 141.6180 2097.2700 \n",
+ "3 Regular FDX07 182.0950 732.3800 \n",
+ "4 Low Fat NCD19 53.8614 994.7052 \n",
+ "\n",
+ " Item_Type Item_Visibility Item_Weight \\\n",
+ "0 Dairy 0.016047 9.30 \n",
+ "1 Soft Drinks 0.019278 5.92 \n",
+ "2 Meat 0.016760 17.50 \n",
+ "3 Fruits and Vegetables 0.000000 19.20 \n",
+ "4 Household 0.000000 8.93 \n",
+ "\n",
+ " Outlet_Establishment_Year Outlet_Identifier Outlet_Location_Type \\\n",
+ "0 1999 OUT049 Tier 1 \n",
+ "1 2009 OUT018 Tier 3 \n",
+ "2 1999 OUT049 Tier 1 \n",
+ "3 1998 OUT010 Tier 3 \n",
+ "4 1987 OUT013 Tier 3 \n",
+ "\n",
+ " Outlet_Size Outlet_Type source \n",
+ "0 Medium Supermarket Type1 train \n",
+ "1 Medium Supermarket Type2 train \n",
+ "2 Medium Supermarket Type1 train \n",
+ "3 NaN Grocery Store train \n",
+ "4 High Supermarket Type1 train "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the first five rows of the combined table.\n",
+ "data.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Item_MRP | \n",
+ " Item_Outlet_Sales | \n",
+ " Item_Visibility | \n",
+ " Item_Weight | \n",
+ " Outlet_Establishment_Year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 14204.000000 | \n",
+ " 8523.000000 | \n",
+ " 14204.000000 | \n",
+ " 11765.000000 | \n",
+ " 14204.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 141.004977 | \n",
+ " 2181.288914 | \n",
+ " 0.065953 | \n",
+ " 12.792854 | \n",
+ " 1997.830681 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 62.086938 | \n",
+ " 1706.499616 | \n",
+ " 0.051459 | \n",
+ " 4.652502 | \n",
+ " 8.371664 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 31.290000 | \n",
+ " 33.290000 | \n",
+ " 0.000000 | \n",
+ " 4.555000 | \n",
+ " 1985.000000 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 94.012000 | \n",
+ " 834.247400 | \n",
+ " 0.027036 | \n",
+ " 8.710000 | \n",
+ " 1987.000000 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 142.247000 | \n",
+ " 1794.331000 | \n",
+ " 0.054021 | \n",
+ " 12.600000 | \n",
+ " 1999.000000 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 185.855600 | \n",
+ " 3101.296400 | \n",
+ " 0.094037 | \n",
+ " 16.750000 | \n",
+ " 2004.000000 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 266.888400 | \n",
+ " 13086.964800 | \n",
+ " 0.328391 | \n",
+ " 21.350000 | \n",
+ " 2009.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Item_MRP Item_Outlet_Sales Item_Visibility Item_Weight \\\n",
+ "count 14204.000000 8523.000000 14204.000000 11765.000000 \n",
+ "mean 141.004977 2181.288914 0.065953 12.792854 \n",
+ "std 62.086938 1706.499616 0.051459 4.652502 \n",
+ "min 31.290000 33.290000 0.000000 4.555000 \n",
+ "25% 94.012000 834.247400 0.027036 8.710000 \n",
+ "50% 142.247000 1794.331000 0.054021 12.600000 \n",
+ "75% 185.855600 3101.296400 0.094037 16.750000 \n",
+ "max 266.888400 13086.964800 0.328391 21.350000 \n",
+ "\n",
+ " Outlet_Establishment_Year \n",
+ "count 14204.000000 \n",
+ "mean 1997.830681 \n",
+ "std 8.371664 \n",
+ "min 1985.000000 \n",
+ "25% 1987.000000 \n",
+ "50% 1999.000000 \n",
+ "75% 2004.000000 \n",
+ "max 2009.000000 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics (count, mean, std, quartiles) of the numeric columns.\n",
+ "numeric_summary = data.describe()\n",
+ "numeric_summary"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Data Cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Item_Fat_Content 0\n",
+ "Item_Identifier 0\n",
+ "Item_MRP 0\n",
+ "Item_Outlet_Sales 5681\n",
+ "Item_Type 0\n",
+ "Item_Visibility 0\n",
+ "Item_Weight 2439\n",
+ "Outlet_Establishment_Year 0\n",
+ "Outlet_Identifier 0\n",
+ "Outlet_Location_Type 0\n",
+ "Outlet_Size 4016\n",
+ "Outlet_Type 0\n",
+ "source 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Count the missing values in each column (vectorised; no per-column Python loop).\n",
+ "data.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Filling missing values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The missing targets are the test rows (they carry no sales figure). The mean is only a\n",
+ "# placeholder there: Item_Outlet_Sales is dropped from the test split before export.\n",
+ "sales_mean = data['Item_Outlet_Sales'].mean()\n",
+ "data['Item_Outlet_Sales'] = data['Item_Outlet_Sales'].fillna(sales_mean)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Impute missing item weights with the mean weight over all rows (train and test combined).\n",
+ "mean_weight = data['Item_Weight'].mean()\n",
+ "data['Item_Weight'] = data['Item_Weight'].fillna(mean_weight)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Medium 4655\n",
+ "Small 3980\n",
+ "High 1553\n",
+ "Name: Outlet_Size, dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Frequency of each outlet size among the non-missing rows; the most frequent\n",
+ "# value is what the next cell fills the gaps with.\n",
+ "data.Outlet_Size.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fill missing outlet sizes with the most frequent size, computed from the data\n",
+ "# instead of hard-coding 'Medium' (which is the mode of the current dataset).\n",
+ "most_common_size = data['Outlet_Size'].mode()[0]\n",
+ "data['Outlet_Size'] = data['Outlet_Size'].fillna(most_common_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Item_Fat_Content 0\n",
+ "Item_Identifier 0\n",
+ "Item_MRP 0\n",
+ "Item_Outlet_Sales 0\n",
+ "Item_Type 0\n",
+ "Item_Visibility 0\n",
+ "Item_Weight 0\n",
+ "Outlet_Establishment_Year 0\n",
+ "Outlet_Identifier 0\n",
+ "Outlet_Location_Type 0\n",
+ "Outlet_Size 0\n",
+ "Outlet_Type 0\n",
+ "source 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Re-check after imputation: every column should now report 0 missing values.\n",
+ "data.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 14204 entries, 0 to 14203\n",
+ "Data columns (total 13 columns):\n",
+ "Item_Fat_Content 14204 non-null object\n",
+ "Item_Identifier 14204 non-null object\n",
+ "Item_MRP 14204 non-null float64\n",
+ "Item_Outlet_Sales 14204 non-null float64\n",
+ "Item_Type 14204 non-null object\n",
+ "Item_Visibility 14204 non-null float64\n",
+ "Item_Weight 14204 non-null float64\n",
+ "Outlet_Establishment_Year 14204 non-null int64\n",
+ "Outlet_Identifier 14204 non-null object\n",
+ "Outlet_Location_Type 14204 non-null object\n",
+ "Outlet_Size 14204 non-null object\n",
+ "Outlet_Type 14204 non-null object\n",
+ "source 14204 non-null object\n",
+ "dtypes: float64(4), int64(1), object(8)\n",
+ "memory usage: 1.4+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Column dtypes, non-null counts and memory footprint after imputation.\n",
+ "data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Food 10201\n",
+ "Non-Consumable 2686\n",
+ "Drinks 1317\n",
+ "Name: Item_Type_Combined, dtype: int64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Derive a coarse product category from the two-letter prefix of Item_Identifier.\n",
+ "# Prefixes missing from the mapping would become NaN.\n",
+ "ITEM_PREFIX_LABELS = {'FD': 'Food', 'NC': 'Non-Consumable', 'DR': 'Drinks'}\n",
+ "data['Item_Type_Combined'] = data['Item_Identifier'].str[:2].map(ITEM_PREFIX_LABELS)\n",
+ "data['Item_Type_Combined'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Label Encoding and One-Hot Encoding of Categorical Variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Import library:\n",
+ "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
+ "\n",
+ "# Item_Fat_Content produces five distinct codes although the preview only shows\n",
+ "# 'Low Fat' and 'Regular', which suggests spelling variants of the same category.\n",
+ "# NOTE(review): the variants below are the ones usually reported for the Big Mart\n",
+ "# data - confirm with data['Item_Fat_Content'].value_counts(). Keys that do not\n",
+ "# occur are simply ignored by replace().\n",
+ "data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(\n",
+ "    {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})\n",
+ "\n",
+ "le = LabelEncoder()\n",
+ "#New variable for outlet: one integer code per Outlet_Identifier\n",
+ "data['Outlet'] = le.fit_transform(data['Outlet_Identifier'])\n",
+ "\n",
+ "# Replace every categorical column by integer codes. fit_transform refits the encoder\n",
+ "# on each column, so one instance is enough. Re-encoding 'Outlet' (already integer\n",
+ "# codes) leaves it unchanged; it stays in the list to keep var_mod as before.\n",
+ "var_mod = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Item_Type_Combined','Outlet_Type','Outlet']\n",
+ "for i in var_mod:\n",
+ "    data[i] = le.fit_transform(data[i])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#One Hot Coding: expand each integer-coded categorical column into 0/1 indicator columns.\n",
+ "one_hot_cols = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet']\n",
+ "data = pd.get_dummies(data, columns=one_hot_cols)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Item_Identifier | \n",
+ " Item_MRP | \n",
+ " Item_Outlet_Sales | \n",
+ " Item_Type | \n",
+ " Item_Visibility | \n",
+ " Item_Weight | \n",
+ " Outlet_Establishment_Year | \n",
+ " Outlet_Identifier | \n",
+ " source | \n",
+ " Item_Fat_Content_0 | \n",
+ " ... | \n",
+ " Outlet_0 | \n",
+ " Outlet_1 | \n",
+ " Outlet_2 | \n",
+ " Outlet_3 | \n",
+ " Outlet_4 | \n",
+ " Outlet_5 | \n",
+ " Outlet_6 | \n",
+ " Outlet_7 | \n",
+ " Outlet_8 | \n",
+ " Outlet_9 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " FDA15 | \n",
+ " 249.8092 | \n",
+ " 3735.1380 | \n",
+ " Dairy | \n",
+ " 0.016047 | \n",
+ " 9.30 | \n",
+ " 1999 | \n",
+ " OUT049 | \n",
+ " train | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " DRC01 | \n",
+ " 48.2692 | \n",
+ " 443.4228 | \n",
+ " Soft Drinks | \n",
+ " 0.019278 | \n",
+ " 5.92 | \n",
+ " 2009 | \n",
+ " OUT018 | \n",
+ " train | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " FDN15 | \n",
+ " 141.6180 | \n",
+ " 2097.2700 | \n",
+ " Meat | \n",
+ " 0.016760 | \n",
+ " 17.50 | \n",
+ " 1999 | \n",
+ " OUT049 | \n",
+ " train | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " FDX07 | \n",
+ " 182.0950 | \n",
+ " 732.3800 | \n",
+ " Fruits and Vegetables | \n",
+ " 0.000000 | \n",
+ " 19.20 | \n",
+ " 1998 | \n",
+ " OUT010 | \n",
+ " train | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " NCD19 | \n",
+ " 53.8614 | \n",
+ " 994.7052 | \n",
+ " Household | \n",
+ " 0.000000 | \n",
+ " 8.93 | \n",
+ " 1987 | \n",
+ " OUT013 | \n",
+ " train | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 37 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Item_Identifier Item_MRP Item_Outlet_Sales Item_Type \\\n",
+ "0 FDA15 249.8092 3735.1380 Dairy \n",
+ "1 DRC01 48.2692 443.4228 Soft Drinks \n",
+ "2 FDN15 141.6180 2097.2700 Meat \n",
+ "3 FDX07 182.0950 732.3800 Fruits and Vegetables \n",
+ "4 NCD19 53.8614 994.7052 Household \n",
+ "\n",
+ " Item_Visibility Item_Weight Outlet_Establishment_Year Outlet_Identifier \\\n",
+ "0 0.016047 9.30 1999 OUT049 \n",
+ "1 0.019278 5.92 2009 OUT018 \n",
+ "2 0.016760 17.50 1999 OUT049 \n",
+ "3 0.000000 19.20 1998 OUT010 \n",
+ "4 0.000000 8.93 1987 OUT013 \n",
+ "\n",
+ " source Item_Fat_Content_0 ... Outlet_0 Outlet_1 Outlet_2 \\\n",
+ "0 train 0 ... 0 0 0 \n",
+ "1 train 0 ... 0 0 0 \n",
+ "2 train 0 ... 0 0 0 \n",
+ "3 train 0 ... 1 0 0 \n",
+ "4 train 0 ... 0 1 0 \n",
+ "\n",
+ " Outlet_3 Outlet_4 Outlet_5 Outlet_6 Outlet_7 Outlet_8 Outlet_9 \n",
+ "0 0 0 0 0 0 0 1 \n",
+ "1 1 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 1 \n",
+ "3 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 0 0 \n",
+ "\n",
+ "[5 rows x 37 columns]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the table after one-hot encoding.\n",
+ "data.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Item_Identifier object\n",
+ "Item_MRP float64\n",
+ "Item_Outlet_Sales float64\n",
+ "Item_Type object\n",
+ "Item_Visibility float64\n",
+ "Item_Weight float64\n",
+ "Outlet_Establishment_Year int64\n",
+ "Outlet_Identifier object\n",
+ "source object\n",
+ "Item_Fat_Content_0 uint8\n",
+ "Item_Fat_Content_1 uint8\n",
+ "Item_Fat_Content_2 uint8\n",
+ "Item_Fat_Content_3 uint8\n",
+ "Item_Fat_Content_4 uint8\n",
+ "Outlet_Location_Type_0 uint8\n",
+ "Outlet_Location_Type_1 uint8\n",
+ "Outlet_Location_Type_2 uint8\n",
+ "Outlet_Size_0 uint8\n",
+ "Outlet_Size_1 uint8\n",
+ "Outlet_Size_2 uint8\n",
+ "Outlet_Type_0 uint8\n",
+ "Outlet_Type_1 uint8\n",
+ "Outlet_Type_2 uint8\n",
+ "Outlet_Type_3 uint8\n",
+ "Item_Type_Combined_0 uint8\n",
+ "Item_Type_Combined_1 uint8\n",
+ "Item_Type_Combined_2 uint8\n",
+ "Outlet_0 uint8\n",
+ "Outlet_1 uint8\n",
+ "Outlet_2 uint8\n",
+ "Outlet_3 uint8\n",
+ "Outlet_4 uint8\n",
+ "Outlet_5 uint8\n",
+ "Outlet_6 uint8\n",
+ "Outlet_7 uint8\n",
+ "Outlet_8 uint8\n",
+ "Outlet_9 uint8\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Column types after encoding: the identifier and source columns remain strings.\n",
+ "data.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Exporting Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Item_Type is replaced by the coarser Item_Type_Combined feature.\n",
+ "# NOTE(review): Outlet_Establishment_Year is dropped as well, although no feature is\n",
+ "# derived from it in the cells above - confirm that discarding it is intended.\n",
+ "data = data.drop(['Item_Type','Outlet_Establishment_Year'], axis=1)\n",
+ "\n",
+ "#Divide into test and train using the 'source' tag. drop() returns new frames, so the\n",
+ "#slices of `data` are never modified in place. The previous inplace drops on slices can\n",
+ "#raise SettingWithCopyWarning, which was being hidden by a blanket warnings filter.\n",
+ "train = data.loc[data['source']==\"train\"].drop(['source'], axis=1)\n",
+ "test = data.loc[data['source']==\"test\"].drop(['Item_Outlet_Sales','source'], axis=1)\n",
+ "\n",
+ "#Export files as modified versions:\n",
+ "train.to_csv(\"train_modified.csv\",index=False)\n",
+ "test.to_csv(\"test_modified.csv\",index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Model Building"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reload the engineered splits from disk so modelling starts from the exported files.\n",
+ "train2, test2 = (pd.read_csv(csv_name) for csv_name in (\"train_modified.csv\", \"test_modified.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Item_Identifier | \n",
+ " Item_MRP | \n",
+ " Item_Outlet_Sales | \n",
+ " Item_Visibility | \n",
+ " Item_Weight | \n",
+ " Outlet_Identifier | \n",
+ " Item_Fat_Content_0 | \n",
+ " Item_Fat_Content_1 | \n",
+ " Item_Fat_Content_2 | \n",
+ " Item_Fat_Content_3 | \n",
+ " ... | \n",
+ " Outlet_0 | \n",
+ " Outlet_1 | \n",
+ " Outlet_2 | \n",
+ " Outlet_3 | \n",
+ " Outlet_4 | \n",
+ " Outlet_5 | \n",
+ " Outlet_6 | \n",
+ " Outlet_7 | \n",
+ " Outlet_8 | \n",
+ " Outlet_9 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " FDA15 | \n",
+ " 249.8092 | \n",
+ " 3735.1380 | \n",
+ " 0.016047 | \n",
+ " 9.30 | \n",
+ " OUT049 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " DRC01 | \n",
+ " 48.2692 | \n",
+ " 443.4228 | \n",
+ " 0.019278 | \n",
+ " 5.92 | \n",
+ " OUT018 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " FDN15 | \n",
+ " 141.6180 | \n",
+ " 2097.2700 | \n",
+ " 0.016760 | \n",
+ " 17.50 | \n",
+ " OUT049 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " FDX07 | \n",
+ " 182.0950 | \n",
+ " 732.3800 | \n",
+ " 0.000000 | \n",
+ " 19.20 | \n",
+ " OUT010 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " NCD19 | \n",
+ " 53.8614 | \n",
+ " 994.7052 | \n",
+ " 0.000000 | \n",
+ " 8.93 | \n",
+ " OUT013 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 34 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Item_Identifier Item_MRP Item_Outlet_Sales Item_Visibility Item_Weight \\\n",
+ "0 FDA15 249.8092 3735.1380 0.016047 9.30 \n",
+ "1 DRC01 48.2692 443.4228 0.019278 5.92 \n",
+ "2 FDN15 141.6180 2097.2700 0.016760 17.50 \n",
+ "3 FDX07 182.0950 732.3800 0.000000 19.20 \n",
+ "4 NCD19 53.8614 994.7052 0.000000 8.93 \n",
+ "\n",
+ " Outlet_Identifier Item_Fat_Content_0 Item_Fat_Content_1 \\\n",
+ "0 OUT049 0 1 \n",
+ "1 OUT018 0 0 \n",
+ "2 OUT049 0 1 \n",
+ "3 OUT010 0 0 \n",
+ "4 OUT013 0 1 \n",
+ "\n",
+ " Item_Fat_Content_2 Item_Fat_Content_3 ... Outlet_0 Outlet_1 \\\n",
+ "0 0 0 ... 0 0 \n",
+ "1 1 0 ... 0 0 \n",
+ "2 0 0 ... 0 0 \n",
+ "3 1 0 ... 1 0 \n",
+ "4 0 0 ... 0 1 \n",
+ "\n",
+ " Outlet_2 Outlet_3 Outlet_4 Outlet_5 Outlet_6 Outlet_7 Outlet_8 \\\n",
+ "0 0 0 0 0 0 0 0 \n",
+ "1 0 1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 0 0 \n",
+ "\n",
+ " Outlet_9 \n",
+ "0 1 \n",
+ "1 0 \n",
+ "2 1 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ "[5 rows x 34 columns]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the reloaded training table.\n",
+ "train2.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Separate the regression target from the predictors; the string identifier\n",
+ "# columns are excluded from the feature matrix.\n",
+ "target_col = 'Item_Outlet_Sales'\n",
+ "id_cols = ['Outlet_Identifier','Item_Identifier']\n",
+ "y_train = train2[target_col]\n",
+ "X_train = train2.drop([target_col] + id_cols, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Test predictors: the same identifier columns are removed as for X_train.\n",
+ "X_test = test2.drop(['Outlet_Identifier','Item_Identifier'], axis=1)\n",
+ "# The estimators consume the matrices column by column, so both must contain the\n",
+ "# same features in the same order; fail loudly instead of predicting on misaligned data.\n",
+ "assert list(X_test.columns) == list(X_train.columns), 'train/test feature columns differ'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Item_MRP | \n",
+ " Item_Visibility | \n",
+ " Item_Weight | \n",
+ " Item_Fat_Content_0 | \n",
+ " Item_Fat_Content_1 | \n",
+ " Item_Fat_Content_2 | \n",
+ " Item_Fat_Content_3 | \n",
+ " Item_Fat_Content_4 | \n",
+ " Outlet_Location_Type_0 | \n",
+ " Outlet_Location_Type_1 | \n",
+ " ... | \n",
+ " Outlet_0 | \n",
+ " Outlet_1 | \n",
+ " Outlet_2 | \n",
+ " Outlet_3 | \n",
+ " Outlet_4 | \n",
+ " Outlet_5 | \n",
+ " Outlet_6 | \n",
+ " Outlet_7 | \n",
+ " Outlet_8 | \n",
+ " Outlet_9 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 249.8092 | \n",
+ " 0.016047 | \n",
+ " 9.30 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 48.2692 | \n",
+ " 0.019278 | \n",
+ " 5.92 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 141.6180 | \n",
+ " 0.016760 | \n",
+ " 17.50 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 182.0950 | \n",
+ " 0.000000 | \n",
+ " 19.20 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 53.8614 | \n",
+ " 0.000000 | \n",
+ " 8.93 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 31 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Item_MRP Item_Visibility Item_Weight Item_Fat_Content_0 \\\n",
+ "0 249.8092 0.016047 9.30 0 \n",
+ "1 48.2692 0.019278 5.92 0 \n",
+ "2 141.6180 0.016760 17.50 0 \n",
+ "3 182.0950 0.000000 19.20 0 \n",
+ "4 53.8614 0.000000 8.93 0 \n",
+ "\n",
+ " Item_Fat_Content_1 Item_Fat_Content_2 Item_Fat_Content_3 \\\n",
+ "0 1 0 0 \n",
+ "1 0 1 0 \n",
+ "2 1 0 0 \n",
+ "3 0 1 0 \n",
+ "4 1 0 0 \n",
+ "\n",
+ " Item_Fat_Content_4 Outlet_Location_Type_0 Outlet_Location_Type_1 \\\n",
+ "0 0 1 0 \n",
+ "1 0 0 0 \n",
+ "2 0 1 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " ... Outlet_0 Outlet_1 Outlet_2 Outlet_3 Outlet_4 Outlet_5 \\\n",
+ "0 ... 0 0 0 0 0 0 \n",
+ "1 ... 0 0 0 1 0 0 \n",
+ "2 ... 0 0 0 0 0 0 \n",
+ "3 ... 1 0 0 0 0 0 \n",
+ "4 ... 0 1 0 0 0 0 \n",
+ "\n",
+ " Outlet_6 Outlet_7 Outlet_8 Outlet_9 \n",
+ "0 0 0 0 1 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 1 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ "[5 rows x 31 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the feature matrix used for training.\n",
+ "X_train.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 3735.1380\n",
+ "1 443.4228\n",
+ "2 2097.2700\n",
+ "3 732.3800\n",
+ "4 994.7052\n",
+ "Name: Item_Outlet_Sales, dtype: float64"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the training target (Item_Outlet_Sales).\n",
+ "y_train.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Linear Regression Model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Ordinary least-squares regression on all predictors. fit() returns the estimator,\n",
+ "# so construction and fitting are chained and the fitted model is displayed.\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "regressor = LinearRegression().fit(X_train, y_train)\n",
+ "regressor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Predict sales for the test rows with the fitted linear model.\n",
+ "y_pred = regressor.predict(X=X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([1848.53604783, 1472.81670435, 1875.65285894, ..., 1809.18796433,\n",
+ " 3565.6645235 , 1267.46171871])"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Display the predicted sales array (numpy abbreviates long arrays).\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "# Regression metrics and cross-validation helpers.\n",
+ "# sklearn.cross_validation was removed in scikit-learn 0.20 (its replacement,\n",
+ "# sklearn.model_selection, is already imported) and none of the visible cells use it,\n",
+ "# so it is no longer imported. With it gone there is no deprecation warning to\n",
+ "# silence, hence no blanket warnings filter either.\n",
+ "from sklearn.metrics import accuracy_score, r2_score, mean_squared_error\n",
+ "from sklearn.model_selection import cross_val_score\n",
+ "from sklearn import metrics\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "56.36"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# score() of a regressor is R^2; it is evaluated on the training data here and shown\n",
+ "# as a percentage. It is a goodness-of-fit figure, not a classification accuracy.\n",
+ "lr_r2 = regressor.score(X_train, y_train)\n",
+ "lr_accuracy = round(lr_r2 * 100, 2)\n",
+ "lr_accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.563589277727048"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Same R^2 computed explicitly from the in-sample predictions.\n",
+ "train_pred = regressor.predict(X_train)\n",
+ "r2_score(y_train, train_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Perform 5-fold cross-validation.\n",
+ "# The scorer is named 'neg_mean_squared_error': scikit-learn scorers follow a\n",
+ "# higher-is-better convention, so MSE is returned negated. The old name\n",
+ "# 'mean_squared_error' was deprecated and later removed, which is why warnings had to be\n",
+ "# silenced here before.\n",
+ "cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1150.93927648 1118.68414103 1112.89657923 1126.30724065 1140.59735737]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Convert the per-fold MSE scores (sign removed) into RMSE values.\n",
+ "fold_rmse = np.sqrt(np.abs(cv_score))\n",
+ "print(fold_rmse)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RMSE : 1127\n"
+ ]
+ }
+ ],
+ "source": [
+ "# In-sample RMSE of the current model.\n",
+ "train_mse = metrics.mean_squared_error(y_train, regressor.predict(X_train))\n",
+ "print(\"RMSE : %.4g\" % np.sqrt(train_mse))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Submission table: the two identifier columns from the test file followed by the predictions.\n",
+ "submission = test2[['Item_Identifier','Outlet_Identifier']].copy()\n",
+ "submission['Item_Outlet_Sales'] = y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the linear-regression predictions without the index column.\n",
+ "submission.to_csv(path_or_buf='submission1.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Decision Tree Model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DecisionTreeRegressor(criterion='mse', max_depth=15, max_features=None,\n",
+ " max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
+ " min_impurity_split=None, min_samples_leaf=300,\n",
+ " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
+ " presort=False, random_state=None, splitter='best')"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fitting Decision Tree Regression to the dataset\n",
+ "from sklearn.tree import DecisionTreeRegressor\n",
+ "# Depth is capped at 15 and every leaf must hold at least 300 samples to limit overfitting.\n",
+ "# random_state fixes the tie-breaking between equally good splits so that a re-run\n",
+ "# reproduces the same tree.\n",
+ "regressor = DecisionTreeRegressor(max_depth=15, min_samples_leaf=300, random_state=42)\n",
+ "regressor.fit(X_train, y_train)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([1673.98398729, 1349.51290433, 471.30684669, ..., 1892.06614452,\n",
+ " 3805.94860417, 1349.51290433])"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Predict sales for the test rows with the fitted tree and display the result.\n",
+ "y_pred = regressor.predict(X=X_test)\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.59"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Training-set R^2 of the tree as a percentage, on the same scale as lr_accuracy\n",
+ "# (the factor 100 was missing before, so the two figures were not comparable).\n",
+ "tree_accuracy = round(regressor.score(X_train,y_train) * 100,2)\n",
+ "tree_accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.5884050821570486"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# R^2 computed explicitly from the tree's in-sample predictions.\n",
+ "train_pred = regressor.predict(X_train)\n",
+ "r2_score(y_train, train_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1138.77137157 1109.42501179 1145.66395939 1113.2648073 1129.0816826 ]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 5-fold cross-validated RMSE of the tree. 'neg_mean_squared_error' is the supported\n",
+ "# scorer name (MSE negated so that higher is better); the removed alias\n",
+ "# 'mean_squared_error' is what previously required silencing warnings.\n",
+ "cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
+ "print(np.sqrt(np.abs(cv_score)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RMSE : 1095\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Root-mean-squared error of the current model on the training data\n",
+ "train_mse = metrics.mean_squared_error(y_train, regressor.predict(X_train))\n",
+ "print(\"RMSE : %.4g\" % np.sqrt(train_mse))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Assemble the submission frame: the two identifier columns from test2 plus the predictions\n",
+ "submission_columns = ['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']\n",
+ "submission = pd.DataFrame(\n",
+ "    {\n",
+ "        'Item_Identifier': test2['Item_Identifier'],\n",
+ "        'Outlet_Identifier': test2['Outlet_Identifier'],\n",
+ "        'Item_Outlet_Sales': y_pred,\n",
+ "    },\n",
+ "    columns=submission_columns,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Persist the submission frame as CSV, leaving out the row index\n",
+ "submission_path = 'submission2.csv'\n",
+ "submission.to_csv(submission_path, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Random Forest Model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,\n",
+ " max_features='auto', max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0, min_impurity_split=None,\n",
+ " min_samples_leaf=50, min_samples_split=2,\n",
+ " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,\n",
+ " oob_score=False, random_state=None, verbose=0, warm_start=False)"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fitting Random Forest Regression to the dataset\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "# A random forest is stochastic (bootstrap resampling of the training rows), so\n",
+ "# pin the seed: otherwise the predictions, scores and submission file below change\n",
+ "# on every Restart & Run All.\n",
+ "regressor = RandomForestRegressor(n_estimators=100, max_depth=6, min_samples_leaf=50,\n",
+ "                                  n_jobs=4, random_state=42)\n",
+ "regressor.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([1643.87106725, 1364.24193091, 603.09113992, ..., 1957.62183676,\n",
+ " 3698.60040819, 1290.25320329])"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Run the fitted model over the test features and echo the predictions\n",
+ "y_pred = regressor.predict(X_test)\n",
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.61"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# score() of the fitted regressor on the training data, rounded to two decimals\n",
+ "rf_train_score = regressor.score(X_train, y_train)\n",
+ "rf_accuracy = round(rf_train_score, 2)\n",
+ "rf_accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6125814698282157"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Unrounded R^2 on the training data, computed from the model's own predictions\n",
+ "train_predictions = regressor.predict(X_train)\n",
+ "r2_score(y_train, train_predictions)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1100.46298396 1077.70836131 1077.65325884 1069.0502564 1083.85364282]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "# 5-fold cross-validated RMSE on the training data.\n",
+ "# scikit-learn scorers follow a higher-is-better convention, so MSE is exposed as\n",
+ "# 'neg_mean_squared_error'. The bare 'mean_squared_error' alias was deprecated and\n",
+ "# later removed, so passing it raises an error on current releases.\n",
+ "cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
+ "# The scores are negated MSE values: flip the sign before taking the square root.\n",
+ "print(np.sqrt(-cv_score))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RMSE : 1062\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Root-mean-squared error of the current model on the training data\n",
+ "train_mse = metrics.mean_squared_error(y_train, regressor.predict(X_train))\n",
+ "print(\"RMSE : %.4g\" % np.sqrt(train_mse))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Assemble the submission frame: the two identifier columns from test2 plus the predictions\n",
+ "submission_columns = ['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']\n",
+ "submission = pd.DataFrame(\n",
+ "    {\n",
+ "        'Item_Identifier': test2['Item_Identifier'],\n",
+ "        'Outlet_Identifier': test2['Outlet_Identifier'],\n",
+ "        'Item_Outlet_Sales': y_pred,\n",
+ "    },\n",
+ "    columns=submission_columns,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Persist the submission frame as CSV, leaving out the row index\n",
+ "submission_path = 'submission3.csv'\n",
+ "submission.to_csv(submission_path, index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}