In [1]:
import pandas as pd
import polars as pl
import numpy as np
import pyarrow


In [2]:
# Load the Parquet file
df = pl.read_parquet(r"D:\Project\data nyc fare data\train.parquet")
df.head()

key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date
str,f64,datetime[μs],f64,f64,f64,f64,i64,date
"""2009-06-15 17:26:21.0000001""",4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15
"""2010-01-05 16:52:16.0000002""",16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05
"""2011-08-18 00:35:00.00000049""",5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18
"""2012-04-21 04:30:42.0000001""",7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21
"""2010-03-09 07:51:00.000000135""",5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09


In [3]:
df.shape

(55423856, 9)

In [4]:
df.columns

['key',
 'fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count',
 'date']

In [5]:
df = df.drop(['key','date'])
df.shape

(55423856, 7)

In [6]:
df = df.with_columns(
    df['pickup_datetime'].dt.day().alias('date'),
    df['pickup_datetime'].dt.month().alias('month'),
    df['pickup_datetime'].dt.year().alias('year'),
    df['pickup_datetime'].dt.weekday().alias('weekday'),
    df['pickup_datetime'].dt.hour().alias('hour'),
    df['pickup_datetime'].dt.time().alias('time') 
)

In [7]:
df.head()

fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,month,year,weekday,hour,time
f64,datetime[μs],f64,f64,f64,f64,i64,i8,i8,i32,i8,i8,time
4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,15,6,2009,1,17,17:26:21
16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,5,1,2010,2,16,16:52:16
5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,18,8,2011,4,0,00:35:00
7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,21,4,2012,6,4,04:30:42
5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,9,3,2010,2,7,07:51:00


In [8]:
df.shape

(55423856, 13)

duplicate_count = df.filter(df.is_duplicated()).shape[0]

print(f"Total number of duplicate rows: {duplicate_count}")

In [9]:
df = df.unique()
df.shape

(55422206, 13)

In [10]:
df.head()

fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,month,year,weekday,hour,time
f64,datetime[μs],f64,f64,f64,f64,i64,i8,i8,i32,i8,i8,time
6.5,2015-02-05 20:04:20,-73.993172,40.767735,-73.980034,40.77586,1,5,2,2015,4,20,20:04:20
4.5,2014-02-16 14:06:02,-73.960529,40.761612,-73.962217,40.767788,1,16,2,2014,7,14,14:06:02
6.5,2012-11-07 19:01:03,-73.986184,40.740117,-73.991788,40.732972,1,7,11,2012,3,19,19:01:03
6.1,2009-12-01 11:32:55,-73.954084,40.784497,-73.970287,40.768137,1,1,12,2009,2,11,11:32:55
12.1,2009-01-15 12:54:31,-73.982465,40.75603,-73.954085,40.77812,1,15,1,2009,4,12,12:54:31


In [11]:
pl.Config.set_tbl_cols(100) 

polars.config.Config

In [12]:
null_counts = df.select([
    pl.col(col).is_null().sum().alias(f"{col}_null_count") for col in df.columns
])

print(null_counts)

shape: (1, 13)
┌─────┬─────┬─────┬────────┬───────┬───────┬───────┬───────┬───────┬───────┬───────┬───────┬───────┐
│ far ┆ pic ┆ pic ┆ pickup ┆ dropo ┆ dropo ┆ passe ┆ date_ ┆ month ┆ year_ ┆ weekd ┆ hour_ ┆ time_ │
│ e_a ┆ kup ┆ kup ┆ _latit ┆ ff_lo ┆ ff_la ┆ nger_ ┆ null_ ┆ _null ┆ null_ ┆ ay_nu ┆ null_ ┆ null_ │
│ mou ┆ _da ┆ _lo ┆ ude_nu ┆ ngitu ┆ titud ┆ count ┆ count ┆ _coun ┆ count ┆ ll_co ┆ count ┆ count │
│ nt_ ┆ tet ┆ ngi ┆ ll_cou ┆ de_nu ┆ e_nul ┆ _null ┆ ---   ┆ t     ┆ ---   ┆ unt   ┆ ---   ┆ ---   │
│ nul ┆ ime ┆ tud ┆ nt     ┆ ll_co ┆ l_cou ┆ _coun ┆ u32   ┆ ---   ┆ u32   ┆ ---   ┆ u32   ┆ u32   │
│ l_c ┆ _nu ┆ e_n ┆ ---    ┆ unt   ┆ nt    ┆ t     ┆       ┆ u32   ┆       ┆ u32   ┆       ┆       │
│ oun ┆ ll_ ┆ ull ┆ u32    ┆ ---   ┆ ---   ┆ ---   ┆       ┆       ┆       ┆       ┆       ┆       │
│ t   ┆ cou ┆ _co ┆        ┆ u32   ┆ u32   ┆ u32   ┆       ┆       ┆       ┆       ┆       ┆       │
│ --- ┆ nt  ┆ unt ┆        ┆       ┆       ┆       ┆       ┆       ┆       ┆

In [13]:
df= df.filter(
    (pl.col('pickup_longitude') != 0.0) &
    (pl.col('pickup_latitude') != 0.0) &
    (pl.col('dropoff_longitude') != 0.0) &
    (pl.col('dropoff_latitude') != 0.0)
)

In [14]:
df.shape

(54318972, 13)

In [15]:
type(null_counts)

polars.dataframe.frame.DataFrame

In [16]:
df = df.filter(
    (pl.col('pickup_latitude').is_between(-90, 90)) &
    (pl.col('dropoff_latitude').is_between(-90, 90))
)
df = df.filter(
    (pl.col('pickup_longitude').is_between(-180, 180)) &
    (pl.col('dropoff_longitude').is_between(-180, 180))
)

In [17]:
df.shape

(54316546, 13)

In [18]:
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return 6371 * c


In [19]:
df_pandas = df.to_pandas()

In [20]:
df_pandas['haversine_dist'] = df_pandas.apply(
    lambda row: haversine(
        row['pickup_latitude'], row['pickup_longitude'],
        row['dropoff_latitude'], row['dropoff_longitude']
    ),
    axis=1
)

# Convert back to Polars DataFrame
df = pl.from_pandas(df_pandas)

KeyboardInterrupt: 

In [1]:
!nvidia-smi

Thu Sep 19 22:44:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.80                 Driver Version: 546.80       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   55C    P0               8W /  40W |      0MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import polars as pl

df = pl.LazyFrame({"a": [1.242, 1.535]})

q = df.select(pl.col("a").round(1))

result = q.collect(engine="gpu")
print(result)

ModuleNotFoundError: GPU engine requested, but required package 'cudf_polars' not found.
Please install using the command `pip install cudf-polars-cu12` (or `pip install cudf-polars-cu11` if your system has a CUDA 11 driver).