# **Explore Data and Train Test Split**

You are required to complete the following tasks:

- Task 1: Import data
- Task 2: Explore data
- Task 3: Define y and X
- Task 4: Split data into train and test samples

Dataset 1, to predict the survival of passengers on the Titanic: https://github.com/ybifoundation/Dataset/raw/main/Titanic.csv

Dataset 2, to predict the selling price of headphones on Flipkart: https://github.com/ybifoundation/Dataset/raw/main/Flipkart%20Headphones.csv

# **😎 Dataset 1**

In [13]:
# import library
import pandas as pd

In [14]:
# Load the Titanic passenger data directly from the course's GitHub repository.
TITANIC_URL = 'https://github.com/ybifoundation/Dataset/raw/main/Titanic.csv'
titanic = pd.read_csv(TITANIC_URL)

In [15]:
# Preview the first five rows to check the data loaded as expected.
titanic.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [16]:
# Column names, non-null counts and dtypes — a quick way to spot missing values
# (e.g. age, cabin, boat, body and home.dest are far from fully populated).
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
titanic.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881138,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.413493,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [18]:
# Column labels of the DataFrame (returned as a pandas Index).
titanic.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [19]:
# Size of the full dataset as (number of rows, number of columns).
titanic.shape

(1309, 14)

In [20]:
# Pairwise Pearson correlation between the numeric columns.
# The frame also holds text columns (name, sex, ticket, ...). Older pandas dropped
# them silently, but pandas >= 2.0 raises when corr() meets non-numeric data, so
# select the numeric dtypes explicitly to keep this cell working on any version.
titanic.select_dtypes(include='number').corr()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
pclass,1.0,-0.312469,-0.408106,0.060832,0.018322,-0.558629,-0.034642
survived,-0.312469,1.0,-0.055512,-0.027825,0.08266,0.244265,
age,-0.408106,-0.055512,1.0,-0.243699,-0.150917,0.17874,0.058809
sibsp,0.060832,-0.027825,-0.243699,1.0,0.373587,0.160238,-0.099961
parch,0.018322,0.08266,-0.150917,0.373587,1.0,0.221539,0.051099
fare,-0.558629,0.244265,0.17874,0.160238,0.221539,1.0,-0.04311
body,-0.034642,,0.058809,-0.099961,0.051099,-0.04311,1.0


In [21]:
# Number of distinct values in each column; low counts (pclass, survived, sex,
# embarked) indicate categorical variables, high counts identifiers or free text.
titanic.nunique()

pclass          3
survived        2
name         1307
sex             2
age            98
sibsp           7
parch           8
ticket        929
fare          281
cabin         186
embarked        3
boat           27
body          121
home.dest     369
dtype: int64

# **Define y and X**

In [22]:
# List the column labels again as a reference for choosing the target (y) and features (X).
titanic.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [23]:
# Target variable: the 'survived' column (two distinct values per the nunique output).
y = titanic.loc[:, 'survived']

In [24]:
# Features: every column except the target 'survived'.
# NOTE(review): 'boat' and 'body' look like information recorded after the outcome
# (the correlation table shows no survived/body value) — confirm whether they
# should be excluded before modelling.
X = titanic.loc[:, [
    'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
    'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest',
]]

# **Train Test Split**

In [25]:
# import function
from sklearn.model_selection import train_test_split

In [26]:
# Split features and target into train (70%) and test (remaining 30%) samples.
# A fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2529,
)

In [27]:
# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test;
# train and test row counts should add up to the full dataset.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((916, 13), (393, 13), (916,), (393,))

In [28]:
# Inspect the training features; the index shows the rows were shuffled by the split
# (pandas renders a truncated preview rather than every row).
X_train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
645,3,"Asplund, Mr. Johan Charles",male,23.0,0,0,350054,7.7958,,S,13,,"Oskarshamn, Sweden Minneapolis, MN"
482,2,"Lehmann, Miss. Bertha",female,17.0,0,0,SC 1748,12.0000,,C,12,,"Berne, Switzerland / Central City, IA"
521,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5000,F33,S,11,,"Folkstone, Kent / New York, NY"
1141,3,"Rice, Master. Albert",male,10.0,4,1,382652,29.1250,,Q,,,
88,1,"Daniels, Miss. Sarah",female,33.0,0,0,113781,151.5500,,S,8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,2,"Cook, Mrs. (Selena Rogers)",female,22.0,0,0,W./C. 14266,10.5000,F33,S,14,,Pennsylvania
943,3,"Laitinen, Miss. Kristina Sofia",female,37.0,0,0,4135,9.5875,,S,,,
740,3,"Culumovic, Mr. Jeso",male,17.0,0,0,315090,8.6625,,S,,,Austria-Hungary
399,2,"Drew, Mr. James Vivian",male,42.0,1,1,28220,32.5000,,S,,,"Greenport, NY"


# **😎 Dataset 2**

In [None]:
# import library
import pandas as pd

In [32]:
# Load the Flipkart headphones data directly from the course's GitHub repository.
HEADPHONE_URL = 'https://github.com/ybifoundation/Dataset/raw/main/Flipkart%20Headphones.csv'
headphone = pd.read_csv(HEADPHONE_URL)

In [33]:
# Preview the first five rows to check the data loaded as expected.
headphone.head(n=5)

Unnamed: 0,Model,Company,Color,Type,Average Rating,Number of Ratings,Selling Price,Maximum Retail Price,Discount
0,5PLUS 5PHP28 Wired without Mic Headset,5PLUS,Red,On the Ear,3.6,101,496,3399,2903
1,A R Wireless compatible with Headset Bluetooth...,A R,Red,Multicolor,3.9,35280,188,799,611
2,Aerizo Wireless Touch R100 Earbuds (Black) Blu...,Aerizo,Black,True Wireless,4.0,1934,589,1298,709
3,Allmusic powerful driven bass with dynamic bea...,Allmusic,Multicolor,In the Ear,4.0,15841,260,1599,1339
4,Allmusic OPP.O Ultra HD Sound Premium Bass Spo...,Allmusic,Black,In the Ear,3.8,10766,270,999,729


In [34]:
# Column names, non-null counts and dtypes; every column is fully populated here
# (1000 non-null values each), so no missing-value handling is needed.
headphone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 1000 non-null   object 
 1   Company               1000 non-null   object 
 2   Color                 1000 non-null   object 
 3   Type                  1000 non-null   object 
 4   Average Rating        1000 non-null   float64
 5   Number of Ratings     1000 non-null   int64  
 6   Selling Price         1000 non-null   int64  
 7   Maximum Retail Price  1000 non-null   int64  
 8   Discount              1000 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 70.4+ KB


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the output shows a minimum Maximum Retail Price of 0 and a negative
# minimum Discount — likely data-quality issues worth checking before modelling.
headphone.describe()

Unnamed: 0,Average Rating,Number of Ratings,Selling Price,Maximum Retail Price,Discount
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3.831,50040.75,832.875,2423.043,1590.168
std,0.467459,157229.7,812.535141,1774.025318,1341.379254
min,1.0,0.0,88.0,0.0,-2991.0
25%,3.6,116.75,349.0,1079.75,697.5
50%,3.9,1712.0,599.0,1999.0,1390.5
75%,4.0,13327.0,999.0,2999.0,2250.0
max,5.0,1299042.0,7990.0,16999.0,12000.0


In [36]:
# Column labels of the DataFrame (returned as a pandas Index).
headphone.columns

Index(['Model', 'Company', 'Color', 'Type', 'Average Rating',
       'Number of Ratings', 'Selling Price', 'Maximum Retail Price',
       'Discount'],
      dtype='object')

In [37]:
# Size of the full dataset as (number of rows, number of columns).
headphone.shape

(1000, 9)

In [38]:
# Pairwise Pearson correlation between the numeric columns.
# The frame also holds text columns (Model, Company, Color, Type). Older pandas
# dropped them silently, but pandas >= 2.0 raises when corr() meets non-numeric
# data, so select the numeric dtypes explicitly to keep this cell working on any version.
headphone.select_dtypes(include='number').corr()

Unnamed: 0,Average Rating,Number of Ratings,Selling Price,Maximum Retail Price,Discount
Average Rating,1.0,0.203486,0.013916,-0.025261,-0.041838
Number of Ratings,0.203486,1.0,-0.038969,-0.033778,-0.021067
Selling Price,0.013916,-0.038969,1.0,0.696545,0.315461
Maximum Retail Price,-0.025261,-0.033778,0.696545,1.0,0.900609
Discount,-0.041838,-0.021067,0.315461,0.900609,1.0


In [39]:
# Number of distinct values in each column; useful for judging which text columns
# (e.g. Type with 16 values vs. Model with 591) are practical to encode as categories.
headphone.nunique()

Model                   591
Company                 202
Color                    94
Type                     16
Average Rating           21
Number of Ratings       434
Selling Price           241
Maximum Retail Price    106
Discount                416
dtype: int64

# **Define y and X**

In [40]:
# List the column labels again as a reference for choosing the target (y) and features (X).
headphone.columns

Index(['Model', 'Company', 'Color', 'Type', 'Average Rating',
       'Number of Ratings', 'Selling Price', 'Maximum Retail Price',
       'Discount'],
      dtype='object')

In [45]:
# Target variable: the 'Selling Price' column.
# Note: this rebinds the name y used for the Titanic target in the Dataset 1 section,
# so that earlier target is no longer available after this cell runs.
y = headphone.loc[:, 'Selling Price']

In [46]:
# Features: every column except the target 'Selling Price'.
# NOTE(review): in the rows previewed by head(), Selling Price equals
# Maximum Retail Price minus Discount, so keeping both columns may leak the
# target into the features — confirm before modelling.
X = headphone.loc[:, [
    'Model', 'Company', 'Color', 'Type', 'Average Rating',
    'Number of Ratings', 'Maximum Retail Price', 'Discount',
]]

# **Train Test Split**

In [47]:
# import function
from sklearn.model_selection import train_test_split

In [48]:
# Split features and target into train (70%) and test (remaining 30%) samples.
# A fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2529,
)

In [49]:
# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test;
# train and test row counts should add up to the full dataset.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((700, 8), (300, 8), (700,), (300,))

In [50]:
# Inspect the training features; the index shows the rows were shuffled by the split
# (pandas renders a truncated preview rather than every row).
X_train

Unnamed: 0,Model,Company,Color,Type,Average Rating,Number of Ratings,Maximum Retail Price,Discount
669,POKRYT Hot SaleASAP charging Version 5.0 Bluet...,POKRYT,Black,In the Ear,4.0,333036,1999,1100
583,N2B MAGNET-02 PACK OF 2 Bluetooth Headset,N2B,Black,True Wireless,3.5,35,1998,1744
688,"PunnkFunnk K20 Gaming Headset, Over Ear Gaming...",PunnkFunnk,Black,On the Ear,3.5,1713,2499,1550
422,Fire-Boltt BN1000 Bluetooth Headset,FIER,Black Red,In the Ear,4.0,1934,2499,1200
825,Tiitan N2 Bluetooth Headset,Tiitan,Black,In the Ear,3.6,1686,1599,1270
...,...,...,...,...,...,...,...,...
740,"SE WORLD P47 WIRELESS HEADPHONE Bluetooth, Wir...",SE,Black,On the Ear,3.8,42,1999,1600
399,FIER MAGNET-K1-I7 Pack of 3 Bluetooth Bluetoot...,FIER,Black,White,3.6,980,1999,1630
828,Togkart TWS i12 Bluetooth Headset,Togkart,Black,True Wireless,3.9,0,1499,1044
562,Mivi DuoPods M20 True Wireless Bluetooth Headset,Mivi,Blue,True Wireless,3.8,1748,2999,2200
