In [1]:
import numpy as np
import pandas as pd

# Pandas Series

**Creating a Series**

In [2]:
# Labels that will become the index of the Series
indexes = ["USA", "Canada", "Mexico"]

# Values, one per label (same order as the labels)
my_data = [1776, 1867, 1821]

# Build the Series, pairing each value with its label
series = pd.Series(data=my_data, index=indexes)

# Show the object's type, then look a value up by its label
print(type(series))
print(series["USA"])

<class 'pandas.core.series.Series'>
1776


**Converting a Python Dictionary to a Series**

In [3]:
# Source mapping: the keys become index labels and the values become the data
dictionary = dict(Car=1000, Gas=10, Spike=7)

# pd.Series accepts a dict directly
series2 = pd.Series(data=dictionary)

# Label-based lookup on the resulting Series
print(series2["Car"])

1000


**Getting keys of the series and broadcasting values**

In [4]:
# keys() on a Series returns its index labels
labels = series2.keys()
print(labels)

# Because pandas is built on NumPy, arithmetic with a scalar is broadcast
# to every element of the Series
doubled = series2 * 2
print(doubled)

# The other arithmetic operations available on NumPy arrays work on a Series in the same way

Index(['Car', 'Gas', 'Spike'], dtype='object')
Car      2000
Gas        20
Spike      14
dtype: int64


# Pandas DataFrame

**Creating a DataFrame**

In [5]:
# A DataFrame is a collection of Series that share the same index

# Row and column labels for the table
index = ['CA', 'NY', 'AZ', 'TX']
columns = ['JAN', 'FEB', 'MARCH']

# Seed NumPy's global random generator so the integers below are reproducible
np.random.seed(101)

# 4x3 array of random integers drawn from 0 (inclusive) to 101 (exclusive)
mydata = np.random.randint(low=0, high=101, size=(4, 3))
print("My Data:\n",mydata, "\n")

# Wrap the array in a DataFrame using the labels defined above
df = pd.DataFrame(mydata, index=index, columns=columns)
print("DataFrame:\n",df, "\n")

My Data:
 [[95 11 81]
 [70 63 87]
 [75  9 77]
 [40  4 63]] 

DataFrame:
     JAN  FEB  MARCH
CA   95   11     81
NY   70   63     87
AZ   75    9     77
TX   40    4     63 



In [6]:
# Load a CSV file into a DataFrame and print it

# NOTE: this rebinds `df`, replacing the small DataFrame built in the previous cell
tips_csv_path = '/kaggle/input/tipscsv/tips.csv'
df = pd.read_csv(tips_csv_path)

print(df)

     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]


In [7]:
# Inspect the column labels and row labels of the loaded DataFrame

column_labels = df.columns
row_labels = df.index

print("Columns:\n", column_labels, "\n")
print("Indices:\n", row_labels, "\n")

# df.columns holds the names of the columns.
# df.index is shown as RangeIndex(start=0, stop=244, step=1): labels start at 0 and
# increase by 1. `stop` is exclusive, so the last row label is 243, not 244.


Columns:
 Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object') 

Indices:
 RangeIndex(start=0, stop=244, step=1) 



In [9]:
# Look at both ends of the table with head() and tail(), then summarise it

# First 10 rows
first_rows = df.head(10)
print("The first 10 rows of the csv file:\n", first_rows, "\n")

# Last 10 rows
last_rows = df.tail(10)
print("The last 10 rows of the csv file:\n", last_rows, "\n")

# describe() gives a statistical description of the columns that contain numbers
summary = df.describe()
print("Description of the CSV File:\n", summary, "\n")

The first 10 rows of the csv file:
    total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
6        8.77  2.00    Male     No  Sun  Dinner     2
7       26.88  3.12    Male     No  Sun  Dinner     4
8       15.04  1.96    Male     No  Sun  Dinner     2
9       14.78  3.23    Male     No  Sun  Dinner     2 

The last 10 rows of the csv file:
      total_bill   tip     sex smoker   day    time  size
234       15.53  3.00    Male    Yes   Sat  Dinner     2
235       10.07  1.25    Male     No   Sat  Dinner     2
236       12.60  1.00    Male    Yes   Sat  Dinner     2
237       32.83  1.17    Male    Yes   Sat  Dinner     2
238       35.83  4.67  Female     No   Sat  Dinn

In [12]:
# Select a subset of columns by indexing the DataFrame with a list of column names
cols = ['total_bill', 'tip']
print("Specific Columns:\n", df[cols], "\n")

# Element-wise sum of two columns.
# The final argument is "\n" (a newline), matching the other print calls in this
# notebook; it was previously the literal letter "n", which was printed after the result.
print("Sum:\n", df['tip'] + df['total_bill'], "\n")

# Similarly, we can perform other mathematical operations on any two or more columns

Specific Columns:
      total_bill   tip
0         16.99  1.01
1         10.34  1.66
2         21.01  3.50
3         23.68  3.31
4         24.59  3.61
..          ...   ...
239       29.03  5.92
240       27.18  2.00
241       22.67  2.00
242       17.82  1.75
243       18.78  3.00

[244 rows x 2 columns] 

Sum:
 0      18.00
1      12.00
2      24.51
3      26.99
4      28.20
       ...  
239    34.95
240    29.18
241    24.67
242    19.57
243    21.78
Length: 244, dtype: float64 n


In [14]:
# Add a derived column: total_bill divided by size, row by row.
# Assigning to a column name creates that column if it does not exist yet
# and overwrites it if it already does.

per_person = df['total_bill'] / df['size']
df['price_per_person'] = per_person

print("Updated Price Per Person:\n", df['price_per_person'], "\n")

Updated Price Per Person:
 0       8.495000
1       3.446667
2       7.003333
3      11.840000
4       6.147500
         ...    
239     9.676667
240    13.590000
241    11.335000
242     8.910000
243     9.390000
Name: price_per_person, Length: 244, dtype: float64 



In [22]:
# iloc selects by integer position; 0 is the position of the first row
first_row = df.iloc[0]
print(first_row)

# The result holds the value that this row has in each column

total_bill           16.99
tip                   1.01
sex                 Female
smoker                  No
day                    Sun
size                     2
price_per_person     8.495
Name: 0, dtype: object


**Conditional Filtering**

In [27]:
# Conditional filtering selects rows of the DataFrame based on a condition.
# We usually do not know in advance which row positions satisfy the condition
# (e.g. whether row 100 does), so positional indexing with iloc is not enough.

# Comparing a column with a scalar gives a boolean Series: True where total_bill > 40.
# The mask is computed once and reused for both the print and the row selection.
bool_series = df['total_bill'] > 40
print(bool_series)
print("\n")

# Indexing the DataFrame with the boolean Series returns only the rows marked True
print("Rows having total_bill > 40:\n", df[bool_series], "\n")

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Name: total_bill, Length: 244, dtype: bool


Rows having total_biill > 40:
      total_bill    tip     sex smoker   day  size  price_per_person
59        48.27   6.73    Male     No   Sat     4         12.067500
95        40.17   4.73    Male    Yes   Fri     4         10.042500
102       44.30   2.50  Female    Yes   Sat     3         14.766667
142       41.19   5.00    Male     No  Thur     5          8.238000
156       48.17   5.00    Male     No   Sun     6          8.028333
170       50.81  10.00    Male    Yes   Sat     3         16.936667
182       45.35   3.50    Male    Yes   Sun     3         15.116667
184       40.55   3.00    Male    Yes   Sun     2         20.275000
197       43.11   5.00  Female    Yes  Thur     4         10.777500
212       48.33   9.00    Male     No   Sat     4         12.082500 



In [29]:
# Conditional filtering also works on string columns.
# Here the condition is written inline instead of being stored in a variable first,
# and the boolean mask is passed to .loc to pick the matching rows.

print(df.loc[df['sex'] == 'Male'])

# This returns all the rows whose sex column equals 'Male'

     total_bill   tip   sex smoker  day  size  price_per_person
1         10.34  1.66  Male     No  Sun     3          3.446667
2         21.01  3.50  Male     No  Sun     3          7.003333
3         23.68  3.31  Male     No  Sun     2         11.840000
5         25.29  4.71  Male     No  Sun     4          6.322500
6          8.77  2.00  Male     No  Sun     2          4.385000
..          ...   ...   ...    ...  ...   ...               ...
236       12.60  1.00  Male    Yes  Sat     2          6.300000
237       32.83  1.17  Male    Yes  Sat     2         16.415000
239       29.03  5.92  Male     No  Sat     3          9.676667
241       22.67  2.00  Male    Yes  Sat     2         11.335000
242       17.82  1.75  Male     No  Sat     2          8.910000

[157 rows x 7 columns]
