# Data Analysis with Pandas
## Dataset : [trip_ds,weather_ds,station_ds]
## Odeh Abuzaid
## 25/10/2021

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw trip records, then print the frame's column / dtype / non-null summary.
trip_ds = pd.read_csv(filepath_or_buffer='./input/trip.csv')
trip_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286858 entries, 0 to 286857
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   trip_id            286858 non-null  int64  
 1   starttime          286858 non-null  object 
 2   stoptime           286858 non-null  object 
 3   bikeid             286858 non-null  object 
 4   tripduration       286858 non-null  float64
 5   from_station_name  286858 non-null  object 
 6   to_station_name    286858 non-null  object 
 7   from_station_id    286858 non-null  object 
 8   to_station_id      286858 non-null  object 
 9   usertype           286858 non-null  object 
 10  gender             181558 non-null  object 
 11  birthyear          181553 non-null  float64
dtypes: float64(2), int64(1), object(9)
memory usage: 26.3+ MB


In [3]:
# Read the daily weather records, then print the frame's column / dtype / non-null summary.
weather_ds = pd.read_csv(filepath_or_buffer='./input/weather.csv')
weather_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Date                        689 non-null    object 
 1   Max_Temperature_F           689 non-null    int64  
 2   Mean_Temperature_F          688 non-null    float64
 3   Min_TemperatureF            689 non-null    int64  
 4   Max_Dew_Point_F             689 non-null    int64  
 5   MeanDew_Point_F             689 non-null    int64  
 6   Min_Dewpoint_F              689 non-null    int64  
 7   Max_Humidity                689 non-null    int64  
 8   Mean_Humidity               689 non-null    int64  
 9   Min_Humidity                689 non-null    int64  
 10  Max_Sea_Level_Pressure_In   689 non-null    float64
 11  Mean_Sea_Level_Pressure_In  689 non-null    float64
 12  Min_Sea_Level_Pressure_In   689 non-null    float64
 13  Max_Visibility_Miles        689 non

In [4]:
# Read the station table, then print the frame's column / dtype / non-null summary.
station_ds = pd.read_csv(filepath_or_buffer='./input/station.csv')
station_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   station_id         58 non-null     object 
 1   name               58 non-null     object 
 2   lat                58 non-null     float64
 3   long               58 non-null     float64
 4   install_date       58 non-null     object 
 5   install_dockcount  58 non-null     int64  
 6   modification_date  17 non-null     object 
 7   current_dockcount  58 non-null     int64  
 8   decommission_date  4 non-null      object 
dtypes: float64(2), int64(2), object(5)
memory usage: 4.2+ KB


## <ins>*What is the average trip duration for a borrowed bicycle?*</ins>

In [5]:

# Arithmetic mean of the `tripduration` column over every recorded trip.
trip_durations_avg = trip_ds["tripduration"].mean()
trip_durations_avg

1178.3542840325176

## <ins>*What’s the most common age of a bicycle-sharer?*</ins>

In [6]:
from datetime import date

# Ages are measured against the date this analysis was written (see the
# notebook header, 25/10/2021) instead of `date.today()`. With today's date the
# answer grows by one every year, so the displayed result and the value `34`
# asserted in the test cell would stop matching on a later re-run.
ANALYSIS_DATE = date(2021, 10, 25)

birth_years = trip_ds["birthyear"]

# NOTE: despite the `_avg` suffix this is the *mode* (most frequent birth
# year), not a mean; the name is kept so existing references keep working.
# `.item()` raises ValueError when several birth years tie for most frequent.
sharer_birth_years_avg = birth_years.mode().item()

current_year = ANALYSIS_DATE.year
most_common_age_of_sharer = current_year - int(sharer_birth_years_avg)
most_common_age_of_sharer

34

## <ins>*Given all the weather data here, find the average precipitation per month, and the median precipitation.*</ins>

In [7]:
# Parse the whole `Date` column in a single vectorised call rather than
# invoking the parser once per row through `.apply`.
weather_ds["Date"] = pd.to_datetime(weather_ds["Date"])

# Calendar month number of each observation, taken straight from the
# datetime column via the `.dt` accessor.
weather_ds["Month"] = weather_ds["Date"].dt.month

# Group once and derive both statistics from the same grouping.
# Rows that share a month number are pooled regardless of their year.
precipitation_by_month = weather_ds.groupby("Month")["Precipitation_In"]

# One-column frames indexed by `Month`; the column labels are the ones the
# later cells (merge, lookup helper) expect.
mean = precipitation_by_month.mean().to_frame("Average")
median = precipitation_by_month.median().to_frame("Median")

In [8]:
# Put the per-month average and median side by side. Both frames are indexed
# by `Month`, so an index join lines the rows up.
#
# The caption is attached as the name of the columns axis. Appending it as an
# extra row of empty strings would turn both numeric columns into object dtype
# and mix integer month labels with a string label in the index, which breaks
# any further numeric use of this table.
mean_median_per_month = mean.join(median).rename_axis(columns='Precipitation per month')

In [9]:
# Show the combined per-month average / median precipitation table as the cell output.
mean_median_per_month

Unnamed: 0_level_0,Average,Median
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.143548,0.02
2,0.168421,0.04
3,0.156935,0.025
4,0.051333,0.0
5,0.012419,0.0
6,0.0305,0.0
7,0.012097,0.0
8,0.018226,0.0
9,0.041,0.0
10,0.189,0.04


## <ins>*What’s the average number of bikes at a given bike station?*</ins>

In [10]:
def get_mean_median_per_month(month):
    """Return the average and median precipitation for one row, as strings.

    Reads the module-level `mean` and `median` frames and looks up the row
    labelled ``month - 1`` in each of them.

    NOTE(review): the label looked up is one less than the argument. If the
    frames are indexed by calendar month number, ``month=2`` returns the
    values stored under label 1, and ``month=1`` looks up label 0. The test
    cell's expected values for ``month=2`` rely on this offset, so confirm the
    intent before changing it.

    Parameters
    ----------
    month : int
        Number whose predecessor (``month - 1``) is used as the row label.

    Returns
    -------
    list of str
        ``[average, median]``, each converted with ``str``.

    Raises
    ------
    KeyError
        If ``month - 1`` is not a label in the index of `mean` or `median`.
    """
    row_label = month - 1
    average_value = mean.loc[row_label].item()
    median_value = median.loc[row_label].item()
    return [str(average_value), str(median_value)]

In [11]:
# Mean of the `current_dockcount` column over every row of the station table.
current_doc_count = station_ds["current_dockcount"].mean()
current_doc_count

16.517241379310345

## <ins>**When a bike station is modified, is it more likely that it’ll lose bikes or gain bikes? How do you know?**</ins>

It is more likely to __lose__ bikes when a bike station is modified.

For the stations that have a `modification_date`, the mean difference between `current_dockcount` and `install_dockcount` is less than zero.
For the given data the average loss is about 3.8 docks per modified station.

In [12]:
# Change in dock count per station: current count minus the count at installation.
station_ds["delta"] = station_ds["current_dockcount"] - station_ds["install_dockcount"]

# Average that change over the rows whose `modification_date` is not null.
avg_bike_for_modified_dates = station_ds.loc[
    station_ds["modification_date"].notna(), "delta"
].mean()

avg_bike_for_modified_dates

-3.764705882352941

## <ins>*Which bike had the longest trip?*</ins>

In [13]:
# Row label of the trip with the largest `tripduration`. `idxmax` returns the
# first such row, so the lookup still yields a single bike id when several
# trips tie for the maximum (filtering on equality and calling `.item()`
# raises ValueError in that case).
longest_trip_label = trip_ds["tripduration"].idxmax()

# The name `most_used_bike` is kept because the test cell refers to it; the
# value is the id of the bike that made the single longest trip.
most_used_bike = trip_ds.loc[longest_trip_label, "bikeid"]
most_used_bike

'SEA00420'

## <ins>*Which day had the coldest temperature?*</ins>

In [14]:
# Lowest recorded daily minimum, in the column's own unit (°F per the column
# name) and converted to °C.
coldest_temp = weather_ds["Min_TemperatureF"].min()
coldest_temp_c = (coldest_temp - 32) * 5 / 9

# Row label of the first day that reached that minimum. `idxmin` copes with
# several days sharing the same minimum, where an equality filter followed by
# `.item()` raises ValueError.
coldest_day_label = weather_ds["Min_TemperatureF"].idxmin()

# Convert explicitly so the 'YYYY-MM-DD HH:MM:SS' text does not depend on an
# earlier cell having already turned `Date` into datetimes.
coldest_temp_day = str(pd.to_datetime(weather_ds.loc[coldest_day_label, "Date"]))
coldest_temp_day

'2016-01-02 00:00:00'

## <ins>*Which day had the most station modifications?*</ins>

In [15]:
# Most frequent value in `modification_date`. `.item()` requires exactly one
# mode and raises ValueError when several dates tie.
most_station_modification_date = station_ds["modification_date"].mode().item()
most_station_modification_date

'2/20/2015'

In [16]:
def test():
    """Check the notebook's computed answers against their recorded values.

    Reads the result variables defined by the earlier cells, so it must run
    after them. Floating-point answers are compared with a tight relative
    tolerance rather than by exact equality or by their `str()` form, so a
    last-digit difference between library versions does not fail the check.

    Raises
    ------
    AssertionError
        If any answer differs from its recorded value.
    """

    def assert_equal(actual, expected):
        # Exact comparison, used for integers and strings.
        assert actual == expected, f"Expected {expected} but got {actual}"

    def assert_close(actual, expected):
        # Relative comparison for floats; atol=0 keeps the check purely relative.
        assert np.isclose(actual, expected, rtol=1e-9, atol=0.0), \
            f"Expected {expected} but got {actual}"

    assert_close(trip_durations_avg, 1178.3542840325176)
    assert_equal(most_common_age_of_sharer, 34)

    # The helper returns both numbers as strings; compare them numerically.
    average_text, median_text = get_mean_median_per_month(2)
    assert_close(float(average_text), 0.1435483870967742)
    assert_close(float(median_text), 0.02)

    assert_close(current_doc_count, 16.517241379310345)
    assert_close(avg_bike_for_modified_dates, -3.764705882352941)
    assert_equal(most_used_bike, 'SEA00420')
    assert_equal(coldest_temp_day, '2016-01-02 00:00:00')
    assert_equal(most_station_modification_date, '2/20/2015')
    print("Success!!!")

test()

Success!!!
