# Python Overview

## Functions

In [3]:
# Define a main function that prints "Greetings!" to the console
def main():
    print("Greetings!")


main()

Greetings!


In [4]:
# Define a main function that accepts a string argument
def main(stock_ticker):
    print(stock_ticker + " is booming right now!")

main('AMZN')

AMZN is booming right now!


In [5]:
# Using F string

def main(stock_ticker):
    print(f"{stock_ticker} is booming right now!")

main('AMZN')

AMZN is booming right now!


In [6]:
# Define a calculate_market_cap function that returns an integer
def calculate_market_cap(market_price, number_of_shares):
    cap = market_price * number_of_shares

    return cap


stock_ticker = "SBUX"
market_price = 76.06
number_of_shares = 1243600000

market_cap = calculate_market_cap(market_price, number_of_shares)
print(f"{stock_ticker} Market Capitalization: {market_cap}")
print(f"Data type of market_cap variable is: {type(market_cap)}")


SBUX Market Capitalization: 94588216000.0
Data type of market_cap variable is: <class 'float'>


In [8]:
# Functions can have more than one parameter
def make_quesadilla(protein, topping):
    quesadilla = f"Here is a {protein} quesadilla with {topping}"
    print(quesadilla)


# Supply the arguments (values) when calling the function
make_quesadilla("beef", "guacamole")
make_quesadilla("chicken", "salsa")

# @NOTE: Order is important when supplying arguments!
make_quesadilla("sour cream", "beef")

Here is a beef quesadilla with guacamole
Here is a chicken quesadilla with salsa
Here is a sour cream quesadilla with beef


In [9]:
# We can also specify default values for parameters
def make_quesadilla(protein, topping="sour cream"):
    quesadilla = f"Here is a {protein} quesadilla with {topping}"
    print(quesadilla)


# Make a quesadilla using the default topping
make_quesadilla("chicken")

# Make a quesadilla with a new topping
make_quesadilla("beef", "guacamole")

Here is a chicken quesadilla with sour cream
Here is a beef quesadilla with guacamole


## Functional Programming
### Lambda Function

In [11]:
# Lambda function
square = lambda x: x ** 2
print(square(5))  # Output: 25



25


### Map Function

In [12]:
# map() function
numbers = [1, 2, 3, 4, 5]
squared_numbers = list(map(lambda x: x ** 2, numbers))
print(squared_numbers)  # Output: [1, 4, 9, 16, 25]



[1, 4, 9, 16, 25]


### Filter function

In [13]:
# filter() function
even_numbers = list(filter(lambda x: x % 2 == 0, numbers))
print(even_numbers)  # Output: [2, 4]



[2, 4]


### Reduce Function

In [14]:
# reduce() function
from functools import reduce
sum_of_numbers = reduce(lambda x, y: x + y, numbers)
print(sum_of_numbers)  # Output: 15

15


## Reading CSV

In [33]:
import csv
import itertools

with open('movieboxoffice.csv', 'r') as file:
    csvreader = csv.reader(file)

    header = next(csvreader)
    for row in itertools.islice(csvreader, 10):
        print(row)

['26-Apr-2021', 'Friday', '1', '$125,789.89 ', '', '-', '$235,036.46 ', '-46.48%', '1000', '$125,789.89 ', '1']
['27-Apr-2021', 'Saturday', '1', '$99,374.01 ', '79.00%', '-26416', '$197,622.55 ', '-49.72%', '1000', '$225,163.90 ', '2']
['28-Apr-2021', 'Sunday', '1', '$82,203.16 ', '82.72%', '-17171', '$116,991.26 ', '-29.74%', '1000', '$307,367.06 ', '3']
['29-Apr-2021', 'Monday', '1', '$33,530.26 ', '40.79%', '-48673', '$66,652.65 ', '-49.69%', '1000', '$340,897.32 ', '4']
['30-Apr-2021', 'Tuesday', '1', '$30,105.24 ', '89.79%', '-3425', '$34,828.19 ', '-13.56%', '1000', '$371,002.56 ', '5']
['1-May-2021', 'Wednesday', '1', '$22,955.06 ', '76.25%', '-7150', '$29,879.53 ', '-23.17%', '1000', '$393,957.62 ', '6']
['2-May-2021', 'Thursday', '1', '$19,579.02 ', '85.29%', '-3376', '$20,236.81 ', '-3.25%', '1000', '$413,536.63 ', '7']
['3-May-2021', 'Friday', '1', '$37,018.43 ', '189.07%', '17439', '$38,390.96 ', '-3.58%', '1000', '$450,555.06 ', '8']
['4-May-2021', 'Saturday', '1', '$55,90

In [69]:
import csv
import itertools
from datetime import datetime

def convert_row(row):
    date = datetime.strptime(row[0], '%d-%b-%Y')  # Convert date string to datetime object
    day_of_week = row[1]  # Leave as string
    rank = int(row[2])  # Convert rank to integer
    daily = float(row[3].replace('$', '').replace(',', ''))  # Convert daily earnings to float
    percent_change = float(row[4].replace('%', '').replace('-', '')) if row[4] else None  # Convert percent change to float, handle empty strings
    diff_k = int(row[5].replace('-', '')) if row[5] else None  # Convert diff to integer, handle empty strings
    forecast = float(row[6].replace('$', '').replace(',', '')) if row[6] else None  # Convert forecast to float
    percent_diff = float(row[7].replace('%', '').replace('-', '')) if row[7] else None  # Convert percent diff to float, handle empty strings
    theaters = int(row[8])  # Convert theaters to integer
    to_date = float(row[9].replace('$', '').replace(',', ''))  # Convert to date earnings to float
    row_id = int(row[10])  # Convert row ID to integer

    return [date, day_of_week, rank, daily, percent_change, diff_k, forecast, percent_diff, theaters, to_date, row_id]

with open('movieboxoffice.csv', 'r') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    
    data = []
    for row in csvreader:
        try:
            converted_row = convert_row(row)
            data.append(converted_row)
        except ValueError as e:
            print(f"Error converting row {row}: {e}")

# Print the transformed data
# for row in data:
#     print(row)

# Quick summary and aggregation
total_daily = sum(row[3] for row in data)
average_daily = total_daily / len(data)
total_to_date = data[-1][9]  # Assuming the last row gives the cumulative earnings

print(f"Total Daily Earnings: ${total_daily:,.2f}")
print(f"Average Daily Earnings: ${average_daily:,.2f}")
print(f"Total Earnings to Date: ${total_to_date:,.2f}")


Error converting row ['26-Apr-2021', 'Friday', '1', '$125,789.89 ', '', '-', '$235,036.46 ', '-46.48%', '1000', '$125,789.89 ', '1']: invalid literal for int() with base 10: ''
Total Daily Earnings: $626,354.04
Average Daily Earnings: $4,931.92
Total Earnings to Date: $752,143.87


## Using Pandas

In [40]:
import pandas as pd

In [43]:
file = 'movieboxoffice.csv'

df = pd.read_csv(file)

df.head()

Unnamed: 0,Date,DOW,Rank,Daily,Percent Change,Diff (k),Forecast,Percent Diff,Theaters,To Date,Row ID
0,26-Apr-2021,Friday,1,"$125,789.89",,-,"$235,036.46",-46.48%,1000,"$125,789.89",1
1,27-Apr-2021,Saturday,1,"$99,374.01",79.00%,-26416,"$197,622.55",-49.72%,1000,"$225,163.90",2
2,28-Apr-2021,Sunday,1,"$82,203.16",82.72%,-17171,"$116,991.26",-29.74%,1000,"$307,367.06",3
3,29-Apr-2021,Monday,1,"$33,530.26",40.79%,-48673,"$66,652.65",-49.69%,1000,"$340,897.32",4
4,30-Apr-2021,Tuesday,1,"$30,105.24",89.79%,-3425,"$34,828.19",-13.56%,1000,"$371,002.56",5


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            128 non-null    object
 1   DOW             128 non-null    object
 2   Rank            128 non-null    int64 
 3   Daily           128 non-null    object
 4   Percent Change  127 non-null    object
 5   Diff (k)        128 non-null    object
 6   Forecast        128 non-null    object
 7   Percent Diff    128 non-null    object
 8   Theaters        128 non-null    int64 
 9   To Date         128 non-null    object
 10  Row ID          128 non-null    int64 
dtypes: int64(3), object(8)
memory usage: 11.1+ KB


In [45]:
df = pd.read_csv(file,
                 parse_dates=[0],
                 usecols=['Date',
                          'DOW',
                          'Daily',
                          'Forecast',
                          'Percent Diff'])
df.head()

Unnamed: 0,Date,DOW,Daily,Forecast,Percent Diff
0,2021-04-26,Friday,"$125,789.89","$235,036.46",-46.48%
1,2021-04-27,Saturday,"$99,374.01","$197,622.55",-49.72%
2,2021-04-28,Sunday,"$82,203.16","$116,991.26",-29.74%
3,2021-04-29,Monday,"$33,530.26","$66,652.65",-49.69%
4,2021-04-30,Tuesday,"$30,105.24","$34,828.19",-13.56%


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          128 non-null    datetime64[ns]
 1   DOW           128 non-null    object        
 2   Daily         128 non-null    object        
 3   Forecast      128 non-null    object        
 4   Percent Diff  128 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 5.1+ KB


In [47]:
df = pd.read_csv(file,
                 header=0,
                 parse_dates=[0],
                 index_col=0,
                 usecols=['Date',
                          'DOW',
                          'Daily',
                          'Forecast',
                          'Percent Diff'])
df.head()

Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,Friday,"$125,789.89","$235,036.46",-46.48%
2021-04-27,Saturday,"$99,374.01","$197,622.55",-49.72%
2021-04-28,Sunday,"$82,203.16","$116,991.26",-29.74%
2021-04-29,Monday,"$33,530.26","$66,652.65",-49.69%
2021-04-30,Tuesday,"$30,105.24","$34,828.19",-13.56%


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128 entries, 2021-04-26 to 2021-08-31
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DOW           128 non-null    object
 1   Daily         128 non-null    object
 2   Forecast      128 non-null    object
 3   Percent Diff  128 non-null    object
dtypes: object(4)
memory usage: 5.0+ KB


In [49]:
df.head(10)

Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,Friday,"$125,789.89","$235,036.46",-46.48%
2021-04-27,Saturday,"$99,374.01","$197,622.55",-49.72%
2021-04-28,Sunday,"$82,203.16","$116,991.26",-29.74%
2021-04-29,Monday,"$33,530.26","$66,652.65",-49.69%
2021-04-30,Tuesday,"$30,105.24","$34,828.19",-13.56%
2021-05-01,Wednesday,"$22,955.06","$29,879.53",-23.17%
2021-05-02,Thursday,"$19,579.02","$20,236.81",-3.25%
2021-05-03,Friday,"$37,018.43","$38,390.96",-3.58%
2021-05-04,Saturday,"$55,900.40","$62,773.88",-10.95%
2021-05-05,Sunday,"$40,984.73","$77,932.26",-47.41%


In [51]:
clean = lambda x: x.str.replace('[^\\d]', '', regex=True)                                                      
c_df = df[['Daily', 'Forecast']].apply(clean, axis=1)
df[['Daily', 'Forecast']] = c_df.astype(float)     

In [52]:
df.head()

Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,Friday,12578989.0,23503646.0,-46.48%
2021-04-27,Saturday,9937401.0,19762255.0,-49.72%
2021-04-28,Sunday,8220316.0,11699126.0,-29.74%
2021-04-29,Monday,3353026.0,6665265.0,-49.69%
2021-04-30,Tuesday,3010524.0,3482819.0,-13.56%


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128 entries, 2021-04-26 to 2021-08-31
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   DOW           128 non-null    object 
 1   Daily         128 non-null    float64
 2   Forecast      128 non-null    float64
 3   Percent Diff  128 non-null    object 
dtypes: float64(2), object(2)
memory usage: 5.0+ KB


In [54]:
df.memory_usage()

Index           1024
DOW             1024
Daily           1024
Forecast        1024
Percent Diff    1024
dtype: int64

In [71]:
df.describe()

Unnamed: 0,Daily,Forecast
count,128.0,128.0
mean,587612.4,936610.3
std,1755046.0,3082168.0
min,0.0,0.0
25%,5636.75,9266.75
50%,42667.5,65231.0
75%,192708.5,289381.8
max,12578990.0,23503650.0


In [75]:
df['Daily'].sum()

75214393.0

In [76]:
df['Daily'].mean()

587612.4453125

In [83]:
df.agg({'Daily': ['mean', 'sum'] , 'Forecast': ['mean', 'sum', 'min', 'max']})

Unnamed: 0,Daily,Forecast
mean,587612.4,936610.3
sum,75214390.0,119886100.0
min,,0.0
max,,23503650.0


In [84]:
df.isna()

Unnamed: 0_level_0,DOW,Daily,Forecast,Percent Diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-26,False,False,False,False
2021-04-27,False,False,False,False
2021-04-28,False,False,False,False
2021-04-29,False,False,False,False
2021-04-30,False,False,False,False
...,...,...,...,...
2021-08-27,False,False,False,False
2021-08-28,False,False,False,False
2021-08-29,False,False,False,False
2021-08-30,False,False,False,False


In [85]:
df.isna().sum()

DOW             0
Daily           0
Forecast        0
Percent Diff    0
dtype: int64

In [86]:
df.columns

Index(['DOW', 'Daily', 'Forecast', 'Percent Diff'], dtype='object')

In [87]:
df['DOW'].value_counts()

DOW
Friday       19
Saturday     19
Sunday       18
Monday       18
Tuesday      18
Wednesday    18
Thursday     18
Name: count, dtype: int64

## Reading data from GitHub

In [89]:
url = 'https://raw.githubusercontent.com/PacktPublishing/Time-Series-Analysis-with-Python-Cookbook/main/datasets/Ch2/AirQualityUCI.csv'

df = pd.read_csv(url,
                 delimiter=';',
                 parse_dates=['Date'],
                 index_col='Date')

df.head()

Unnamed: 0_level_0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10/03/2004,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
10/03/2004,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
10/03/2004,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
10/03/2004,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
10/03/2004,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


## Reading data from a Public S3 bucket

**Path-style access**

In [90]:
url = 'https://s3.us-east-1.amazonaws.com/tscookbook/AirQualityUCI.xlsx'

df = pd.read_excel(url,
                   index_col='Date',
                   parse_dates=True)
df.head()

Unnamed: 0_level_0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


**Virtual-hosted–style access**

In [91]:
url = 'https://tscookbook.s3.amazonaws.com/AirQualityUCI.xlsx'

df = pd.read_excel(url,
                   index_col='Date',
                   parse_dates=True)
df.head()

Unnamed: 0_level_0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


**Accessing a bucket using S3://**

In [92]:
s3uri = 's3://tscookbook/AirQualityUCI.xlsx'
df = pd.read_excel(s3uri,
                   index_col='Date',
                   parse_dates=True)
df.head()

Unnamed: 0_level_0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


## Reading from HTML

In [93]:
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"
results = pd.read_html(url)
print(len(results))

69


In [94]:
df = results[15]
df.columns

Index(['Region[30]', 'Total cases', 'Total deaths', 'Cases per million',
       'Deaths per million', 'Current weekly cases', 'Current weekly deaths',
       'Population millions', 'Vaccinated %[31]'],
      dtype='object')

In [95]:
df[['Region[30]','Total cases', 'Total deaths', 'Cases per million']].head(3)

Unnamed: 0,Region[30],Total cases,Total deaths,Cases per million
0,European Union,179537758,1185108,401363
1,North America,103783777,1133607,281404
2,Other Europe,57721948,498259,247054


In [97]:
from io import StringIO
import pandas as pd

html = """
 <table>
   <tr>
     <th>Ticker</th>
     <th>Price</th>
   </tr>
   <tr>
     <td>MSFT</td>
     <td>230</td>
   </tr>
   <tr>
     <td>APPL</td>
     <td>300</td>
   </tr>
     <tr>
     <td>MSTR</td>
     <td>120</td>
   </tr>
 </table>

 </body>
 </html>
 """

df = pd.read_html(StringIO(html))
df[0]


Unnamed: 0,Ticker,Price
0,MSFT,230
1,APPL,300
2,MSTR,120


In [100]:
file = ('LA_weather.parquet')
df = pd.read_parquet(file,
                    engine='pyarrow')

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4899 entries, 0 to 4898
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          4899 non-null   object        
 1   NAME             4899 non-null   object        
 2   DATE             4899 non-null   object        
 3   PRCP             4899 non-null   float64       
 4   PRCP_ATTRIBUTES  4899 non-null   object        
 5   SNOW             121 non-null    float64       
 6   SNOW_ATTRIBUTES  121 non-null    object        
 7   SNWD             59 non-null     float64       
 8   SNWD_ATTRIBUTES  59 non-null     object        
 9   TAVG             3713 non-null   float64       
 10  TAVG_ATTRIBUTES  3713 non-null   object        
 11  TMAX             4899 non-null   int64         
 12  TMAX_ATTRIBUTES  4899 non-null   object        
 13  TMIN             4899 non-null   int64         
 14  TMIN_ATTRIBUTES  4899 non-null   object 

In [102]:
filters = [('year', '==', 2012)]
df_2012 = pd.read_parquet(file, 
                          engine='pyarrow', 
                          filters=filters)

In [103]:
df_2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          366 non-null    object        
 1   NAME             366 non-null    object        
 2   DATE             366 non-null    object        
 3   PRCP             366 non-null    float64       
 4   PRCP_ATTRIBUTES  366 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             0 non-null      float64       
 10  TAVG_ATTRIBUTES  0 non-null      object        
 11  TMAX             366 non-null    int64         
 12  TMAX_ATTRIBUTES  366 non-null    object        
 13  TMIN             366 non-null    int64         
 14  TMIN_ATTRIBUTES  366 non-null    object   

In [104]:
filters = [('year', '>', 2020)]

df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          881 non-null    object        
 1   NAME             881 non-null    object        
 2   DATE             881 non-null    object        
 3   PRCP             881 non-null    float64       
 4   PRCP_ATTRIBUTES  881 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             881 non-null    float64       
 10  TAVG_ATTRIBUTES  881 non-null    object        
 11  TMAX             881 non-null    int64         
 12  TMAX_ATTRIBUTES  881 non-null    object        
 13  TMIN             881 non-null    int64         
 14  TMIN_ATTRIBUTES  881 non-null    object   

In [105]:
filters = [('year', '>=', 2021)]
pd.read_parquet(file, 
                engine='pyarrow', 
                filters= filters).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          881 non-null    object        
 1   NAME             881 non-null    object        
 2   DATE             881 non-null    object        
 3   PRCP             881 non-null    float64       
 4   PRCP_ATTRIBUTES  881 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             881 non-null    float64       
 10  TAVG_ATTRIBUTES  881 non-null    object        
 11  TMAX             881 non-null    int64         
 12  TMAX_ATTRIBUTES  881 non-null    object        
 13  TMIN             881 non-null    int64         
 14  TMIN_ATTRIBUTES  881 non-null    object   

In [106]:
filters = [('year', 'in', [2021, 2022, 2023])]
df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   STATION          881 non-null    object        
 1   NAME             881 non-null    object        
 2   DATE             881 non-null    object        
 3   PRCP             881 non-null    float64       
 4   PRCP_ATTRIBUTES  881 non-null    object        
 5   SNOW             0 non-null      float64       
 6   SNOW_ATTRIBUTES  0 non-null      object        
 7   SNWD             0 non-null      float64       
 8   SNWD_ATTRIBUTES  0 non-null      object        
 9   TAVG             881 non-null    float64       
 10  TAVG_ATTRIBUTES  881 non-null    object        
 11  TMAX             881 non-null    int64         
 12  TMAX_ATTRIBUTES  881 non-null    object        
 13  TMIN             881 non-null    int64         
 14  TMIN_ATTRIBUTES  881 non-null    object   

In [107]:
columns = ['DATE', 'year', 'TMAX']
df = pd.read_parquet(file, 
                     engine='pyarrow', 
                     filters=filters, 
                     columns=columns)

df.head()

Unnamed: 0,DATE,year,TMAX
0,2021-01-01,2021,67
1,2021-01-02,2021,63
2,2021-01-03,2021,62
3,2021-01-04,2021,59
4,2021-01-05,2021,57


## SQLite

In [1]:
import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""

con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()

In [2]:
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data)
con.commit()

In [3]:
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [5]:
import pandas as pd

In [7]:
pd.read_sql_query("SELECT * FROM test", con)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


## Creating a DataFrame


In [10]:
# Creating a DataFrame from a dictionary
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'San Francisco', 'Los Angeles']
}
df = pd.DataFrame(data)
df


Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,San Francisco
2,Charlie,35,Los Angeles


In [12]:
# Accessing columns
df['Name']


0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

In [14]:
# Accessing rows by index
df.loc[0]
df.iloc[0]


Name       Alice
Age           25
City    New York
Name: 0, dtype: object