In [1]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load seaborn's example dataset named 'tips' into the working frame `data`.
data = sns.load_dataset('tips')
# Bare last expression: render the frame as this cell's output.
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Number of axes of the frame (rows x columns -> 2).
data.ndim

2

In [4]:
# (row count, column count) of the raw dataset.
data.shape

(244, 7)

In [5]:
# Count missing values per column to decide whether imputation is needed.
data.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [6]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

1

In [7]:
# Remove fully duplicated rows (the previous cell reports one). The result is
# rebound to `data` instead of using `inplace=True`: the effect is the same, but
# the frame shown by earlier cells is not mutated behind the reader's back.
# The original index labels are kept (the index is not reset).
data = data.drop_duplicates()

In [8]:
# Summarise column dtypes and non-null counts after de-duplication.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 243 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  243 non-null    float64 
 1   tip         243 non-null    float64 
 2   sex         243 non-null    category
 3   smoker      243 non-null    category
 4   day         243 non-null    category
 5   time        243 non-null    category
 6   size        243 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 9.1 KB


In [9]:
# Descriptive statistics for the numeric columns (total_bill, tip, size).
data.describe()

Unnamed: 0,total_bill,tip,size
count,243.0,243.0,243.0
mean,19.813868,3.002387,2.572016
std,8.910071,1.385002,0.952356
min,3.07,1.0,1.0
25%,13.38,2.0,2.0
50%,17.81,2.92,2.0
75%,24.175,3.575,3.0
max,50.81,10.0,6.0


In [10]:
# Encoder used in the next cell to turn the categorical feature columns into numeric codes.
ordinalEncoder = OrdinalEncoder()

In [11]:
# Categorical feature columns to convert into numeric codes.
columnsToEncode = ['sex', 'smoker', 'day']

# Fit the encoder ONCE on all columns together rather than re-fitting it inside
# a per-column loop. OrdinalEncoder learns a separate category list for every
# column, so the encoded values are the same as before, but the fitted encoder
# now keeps the mapping of every column (usable later through `transform` /
# `inverse_transform`) instead of only the mapping of the last column.
encodedValues = ordinalEncoder.fit_transform(data[columnsToEncode])

# Write the codes back one column at a time; each assignment replaces the
# categorical column with a float column.
for position, col in enumerate(columnsToEncode):
    data[col] = encodedValues[:, position]

# NOTE(review): the codes assigned to `day` do not follow weekday order
# (Sat -> 1.0 and Sun -> 2.0 in the output below), yet the linear model treats
# them as an ordered numeric feature. Consider one-hot encoding `day` instead.

In [12]:
# Display the frame to check the encoded sex / smoker / day columns.
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0.0,0.0,2.0,Dinner,2
1,10.34,1.66,1.0,0.0,2.0,Dinner,3
2,21.01,3.50,1.0,0.0,2.0,Dinner,3
3,23.68,3.31,1.0,0.0,2.0,Dinner,2
4,24.59,3.61,0.0,0.0,2.0,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,1.0,0.0,1.0,Dinner,3
240,27.18,2.00,0.0,1.0,1.0,Dinner,2
241,22.67,2.00,1.0,1.0,1.0,Dinner,2
242,17.82,1.75,1.0,0.0,1.0,Dinner,2


In [13]:
# Encoder used in the next cell for the `time` column.
labelEncoder = LabelEncoder()

In [14]:
# Replace the `time` labels with integer class ids (Dinner -> 0 in the output
# below). This column is used as the prediction target `y` later on.
data['time'] = labelEncoder.fit_transform(data['time'])

In [15]:
# Display the frame to check the encoded `time` column.
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0.0,0.0,2.0,0,2
1,10.34,1.66,1.0,0.0,2.0,0,3
2,21.01,3.50,1.0,0.0,2.0,0,3
3,23.68,3.31,1.0,0.0,2.0,0,2
4,24.59,3.61,0.0,0.0,2.0,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,1.0,0.0,1.0,0,3
240,27.18,2.00,0.0,1.0,1.0,0,2
241,22.67,2.00,1.0,1.0,1.0,0,2
242,17.82,1.75,1.0,0.0,1.0,0,2


In [16]:
# Scaler used in the next cell for the numeric feature columns.
robustScaler = RobustScaler()

In [17]:
# Numeric feature columns to rescale.
columnsToScale = ['total_bill', 'tip', 'size']

# Fit the scaler ONCE on all numeric columns together rather than re-fitting it
# inside a per-column loop. RobustScaler centres and scales every column
# independently (median / interquartile range), so the scaled values are the
# same as before, but the fitted scaler now keeps the statistics of every
# column (usable later through `transform` / `inverse_transform`) instead of
# only those of the last column.
scaledValues = robustScaler.fit_transform(data[columnsToScale])

# Write the scaled values back one column at a time; `size` becomes a float column.
for position, col in enumerate(columnsToScale):
    data[col] = scaledValues[:, position]

# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so test-set statistics leak into preprocessing.
# Fitting on the training rows only (e.g. inside a sklearn Pipeline) avoids it.

In [18]:
# Display the frame to check the scaled total_bill / tip / size columns.
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,-0.075961,-1.212698,0.0,0.0,2.0,0,0.0
1,-0.691987,-0.800000,1.0,0.0,2.0,0,1.0
2,0.296434,0.368254,1.0,0.0,2.0,0,1.0
3,0.543770,0.247619,1.0,0.0,2.0,0,0.0
4,0.628069,0.438095,0.0,0.0,2.0,0,2.0
...,...,...,...,...,...,...,...
239,1.039370,1.904762,1.0,0.0,1.0,0,1.0
240,0.867994,-0.584127,0.0,1.0,1.0,0,0.0
241,0.450208,-0.584127,1.0,1.0,1.0,0,0.0
242,0.000926,-0.742857,1.0,0.0,1.0,0,0.0


In [19]:
# Target: the label-encoded `time` column. Features: every remaining column.
y = data['time']
X = data.drop(columns=['time'])

In [20]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y`
# if the class balance between train and test matters.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=32)

In [21]:
# Sanity check: shapes of the train/test features and targets, in split order.
tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

((194, 6), (49, 6), (194,), (49,))

In [22]:
# Binary classifier for `time`, created with scikit-learn's default settings.
logisticRegression = LogisticRegression()

In [23]:
# Train the classifier on the training split only.
logisticRegression.fit(xtrain, ytrain)

In [24]:
# Predicted class ids for the held-out test rows.
yprediction = logisticRegression.predict(xtest)

In [25]:
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes (row sums match the per-class support in the report below).
confusion_matrix(ytest, yprediction)

array([[34,  0],
       [ 3, 12]], dtype=int64)

In [26]:
# Fraction of test rows whose predicted class equals the true class.
accuracy_score(ytest, yprediction)

0.9387755102040817

In [27]:
# Per-class precision / recall / F1 on the test split. The report is a plain
# string, so print() is used to keep its column layout.
print(classification_report(ytest, yprediction))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        34
           1       1.00      0.80      0.89        15

    accuracy                           0.94        49
   macro avg       0.96      0.90      0.92        49
weighted avg       0.94      0.94      0.94        49

