# Load the dataset
- Load the Tips dataset using pandas.


In [None]:
import pandas as pd
import numpy as np
# Read the Tips CSV into a DataFrame and render the whole table in the
# notebook output.
df = pd.read_csv(filepath_or_buffer="/content/tips.csv")
display(df)

# Basic data exploration
- Display the first 5 rows of the dataset.
- Display the summary statistics of the dataset.
- Check for missing values in the dataset.

In [None]:
# Preview the first five rows.
display(df.head(5))

# Summary statistics (count, mean, std, min, quartiles, max).
# `df.info()` only prints a schema summary and returns None, so wrapping it in
# `display(...)` showed no statistics; `describe()` returns the statistics table.
display(df.describe())

# Number of missing values in each column.
display(df.isnull().sum())

# Data selection
- Select the columns 'total_bill', 'tip', and 'sex'.
- Select the rows where the 'total_bill' is greater than $20.
- Select the rows where the day is 'Sun' and the 'time' is 'Dinner'.


In [None]:
# Column subset: keep only 'total_bill', 'tip' and 'sex'.
selected_columns = df.loc[:, ["total_bill", "tip", "sex"]]
display(selected_columns)

# Row filter: bills strictly greater than 20.
display(df.loc[df["total_bill"].gt(20)])

# Row filter: both conditions must hold (Sunday AND dinner).
display(df.loc[df["day"].eq("Sun") & df["time"].eq("Dinner")])


# Grouping and aggregation:

- Calculate the average total bill for each day.
- Calculate the sum of tips for each day.
- Find the maximum total bill for each combination of day and time.

In [None]:
# Mean total bill for each day.
display(df.groupby(by="day")["total_bill"].agg("mean"))

# Total of all tips for each day.
display(df.groupby(by="day")["tip"].agg("sum"))

# Largest total bill for each (day, time) pair.
display(df.groupby(by=["day", "time"])["total_bill"].agg("max"))

# Data transformation:

- Add a new column 'tip_percentage', computed as ('tip' / 'total_bill') * 100.
- Create a new column 'bill_per_person' which is the total bill divided by the number of people ('size').

In [None]:
# Tip as a percentage of the total bill: (tip / total_bill) * 100.
df["tip_percentage"] = df["tip"].div(df["total_bill"]).mul(100)
display(df)

# Share of the bill per person: total_bill / size.
df["bill_per_person"] = df["total_bill"].div(df["size"])
display(df)

# Filtering:

- Filter the rows where 'tip_percentage' is greater than 15%.
- Filter the rows whose 'total_bill' is among the 10 highest bills.

In [None]:
# Rows whose tip percentage is strictly above 15.
filtered_rows = df.loc[df["tip_percentage"].gt(15)]
display(filtered_rows)

# Take the 10 largest bill values, then keep every row whose bill matches one
# of those values (rows stay in their original order).
top_10_bills = df["total_bill"].nlargest(n=10)
filtered_df = df.loc[df["total_bill"].isin(top_10_bills)]
print("Top 10 Total Bills:")
display(filtered_df)



# Sorting:

- Sort the dataset by 'total_bill' in descending order.

In [None]:
# Sort by 'total_bill', largest first.
# The result is named `sorted_df` rather than `sorted` so the builtin
# `sorted()` is not shadowed for the rest of the notebook session.
sorted_df = df.sort_values(by="total_bill", ascending=False)
display(sorted_df)

# Handling missing data:

- Replace any missing values in the 'tip' column with the mean of the 'tip' column.
- Drop any rows where the 'size' is missing.

In [None]:
# Mean of the 'tip' column (NaN values are skipped by default).
tip_mean = df["tip"].mean()
print(tip_mean)

# Fill missing tips with the column mean.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# column selection: that chained in-place form raises a FutureWarning in
# pandas 2.x and does not modify `df` under Copy-on-Write.
df["tip"] = df["tip"].fillna(tip_mean)
display(df)

# Remove rows that have no 'size' value; operates on `df` itself, in place.
df.dropna(subset=["size"], inplace=True)
display(df)