## Loading Data from CSV file

In [1]:
import pandas as pd
data = pd.read_csv('sales_returns.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,cust_age,cust_gender,...,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit,returned
0,18,19,CA-2015-143336,27/08/15,01/09/15,Second Class,ZD-21925,Zuschuss Donatelli,19,Female,...,West,OFF-AR-10003056,Office Supplies,Art,Newell 341,8.56,2,0.0,2.4824,True
1,19,20,CA-2015-143336,27/08/15,01/09/15,Second Class,ZD-21925,Zuschuss Donatelli,19,Female,...,West,TEC-PH-10001949,Technology,Phones,Cisco SPA 501G IP Phone,213.48,3,0.2,16.011,True
2,20,21,CA-2015-143336,27/08/15,01/09/15,Second Class,ZD-21925,Zuschuss Donatelli,19,Female,...,West,OFF-BI-10002215,Office Supplies,Binders,"Wilson Jones Hanging View Binder, White, 1""",22.72,4,0.2,7.384,True
3,55,56,CA-2017-111682,17/06/17,18/06/17,First Class,TB-21055,Ted Butterfield,20,Female,...,East,OFF-ST-10000604,Office Supplies,Storage,Home/Office Personal File Carts,208.56,6,0.0,52.14,True
4,56,57,CA-2017-111682,17/06/17,18/06/17,First Class,TB-21055,Ted Butterfield,20,Female,...,East,OFF-PA-10001569,Office Supplies,Paper,Xerox 232,32.4,5,0.0,15.552,True


## Selecting Input Variables ( Predictors)

In [4]:
X = data.loc[:,['cust_age','cust_gender','city','sub_category']]
y = data.returned
X.head()

Unnamed: 0,cust_age,cust_gender,city,sub_category
0,19,Female,San Francisco,Art
1,19,Female,San Francisco,Phones
2,19,Female,San Francisco,Binders
3,20,Female,Troy,Storage
4,20,Female,Troy,Paper


In [6]:
y.head()

0    True
1    True
2    True
3    True
4    True
Name: returned, dtype: bool

## Data Preparation

In [8]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
X.cust_gender = enc.fit_transform(X.cust_gender)
X.city = enc.fit_transform(X.city)
X.sub_category = enc.fit_transform(X.sub_category)
X.head()

Unnamed: 0,cust_age,cust_gender,city,sub_category
0,19,0,156,2
1,19,0,156,13
2,19,0,156,3
3,20,0,173,14
4,20,0,173,12


## Spliting Data to Train and Test

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

## Machine Learning Modeling - Training

In [12]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train,y_train)  #training

## Machine Learning Model Evaluation - Testing

In [14]:
from sklearn.metrics import accuracy_score
y_predict = model.predict(X_test)
print(accuracy_score(y_test,y_predict))

0.9601226993865031


## Model Evaluation - Confusion Matrix

In [None]:
pd.crosstab(y_test,y_predict)

col_0,False,True
returned,Unnamed: 1_level_1,Unnamed: 2_level_1
False,124,10
True,6,186


In [16]:
#cust_age	cust_gender	city	sub_category
# 40           Female   Dubai        Art
model.predict([[40,0,100,15]]) # cust_age, gender, city, sub_category - input features used to train the model



array([False])