In [None]:
#### Data case approach  ####

In [None]:
## 1. SCQ

#S: 
#C:
#Q:

In [None]:
### 2. Prepare dataset

##2.a Load data
import pandas as pd

# Raw input table read from disk.
data = pd.read_csv('data.csv')

#Create train-test set
# A reproducible random 80% sample of the rows becomes the training part and
# every row that was not sampled becomes the test part (the sampled rows are
# random, not a fixed ID range).
train = data.sample(frac=0.8, random_state=27)
test = data.loc[~data.index.isin(train.index)]

# Quick size check (uncomment to inspect):
#train.shape
#test.shape

# Stack train on top of test and renumber the rows from 0.
data_set = pd.concat([train, test]).reset_index(drop=True)

##2.b Look at data
# A notebook cell only renders its LAST bare expression, so every inspection
# result is passed to display() explicitly (info() prints on its own).
display(data_set.head(10))                      #first rows
data_set.info()                                 #dtypes and non-null counts
display(data_set.describe())                    #numerical
display(data_set.describe(include=['O']))       #categorical => look if Names are different

display(data_set.duplicated().value_counts())   #fully duplicated rows: True/False counts
display(data_set.isna().sum())                  #missing values per column (isnull() is an alias of isna())

import numpy as np

# Sanity check: report whether any numeric column holds a negative value.
# The check runs on data_set, the combined table built in step 2.a.
df = data_set

numeric_cols = df.select_dtypes(include=[np.number]).columns
# First any(): per column; second any(): across all columns.
has_negative_values = (df[numeric_cols] < 0).any().any()

if has_negative_values:
    print("The DataFrame contains negative values.")
else:
    print("The DataFrame does not contain negative values.")


#Limitations dataset (ROCCC):
    #Reliable — LOW — Not reliable as it only has 30 respondents, which is not representative of the entire fitness population.
    #Original — LOW — Third party provider (Amazon Mechanical Turk)
    #Comprehensive — MEDIUM — Parameters match most of Bellabeat products’ parameters
    #Current — LOW — Data is 5 years old and may not be relevant: users’ daily activity, fitness and sleeping habits, diet and food consumption might have changed since then.
    #Cited — LOW — Data collected from third party, hence unknown. As data is collected in a survey, we are unable to ascertain its integrity or accuracy.

In [None]:
## 3. Clean and manage data
from datetime import date

##3.a Drop features

#Features do not correlate to y
# NOTE(review): '..' and '...' look like placeholders - replace them with the
# real column names before running.
data_set = data_set.drop(columns=['..', '...']) #drop columns

#Highly incomplete
data_set = data_set.dropna(subset=['...', '...']) #drop rows with NaN in any listed column

#Duplicates
# Number of ride_id values that repeat an earlier one. display() is required
# because a bare expression in the middle of a cell is not rendered.
display(data["ride_id"].duplicated().sum())

##3.b Complete features

#Age
# Missing ages are filled with the median age of the training rows that share
# the same Sex code and Pclass.
data_set['Sex'] = data_set['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
train = data_set.loc[data_set['PassengerId'] <= 891]    #use training data to guess

guess_ages = np.zeros((2,3))    #rows: Sex code 0/1, columns: Pclass 1..3
for i in range(0, 2):
    for j in range(0, 3):
        same_group = (train['Sex'] == i) & (train['Pclass'] == j+1)
        guess_ages[i,j] = train.loc[same_group, 'Age'].dropna().median().astype(int)
        data_set.loc[ (data_set.Age.isnull()) & (data_set.Sex == i) & (data_set.Pclass == j+1),'Age'] = guess_ages[i,j]

#Title
# Extract the run of letters (upper or lower case) directly before a period.
# The pattern is a raw string so that "\." is not treated as a string escape.
data_set['Title'] = data_set.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
data_set = data_set.drop(['Name'], axis=1)

# Look which titles to replace. The training rows are selected again here
# because `train` above was sliced before the Title column existed.
train_titles = data_set.loc[data_set['PassengerId'] <= 891]
display(pd.crosstab(train_titles['Title'], train_titles['Sex']))

rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
data_set['Title'] = data_set['Title'].replace(rare_titles, 'Rare').replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
display(data_set[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())  #look which titles are left

#3.c Create features

#Familysize
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself - confirm against the data dictionary). The two source columns are
# removed once the combined feature exists.
data_set = (
    data_set
    .assign(FamilySize=data_set['SibSp'] + data_set['Parch'] + 1)
    .drop(columns=['SibSp', 'Parch'])
)

#Bike length
# pd.to_datetime is used instead of .astype('datetime64'): pandas 2.x refuses
# to cast to the unit-less 'datetime64' dtype.
data['started_at'] = pd.to_datetime(data['started_at'])
data['ended_at'] = pd.to_datetime(data['ended_at'])

# Ride duration in minutes; the int32 cast truncates the fractional part.
data['ride_length'] = (data['ended_at'] - data['started_at'])/pd.Timedelta(minutes=1)
data['ride_length'] = data['ride_length'].astype('int32')

# Keep only rides longer than 0 whole minutes and renumber the rows from 0.
data = data[data['ride_length'] > 0].reset_index(drop=True)

# Calendar parts of the start time, used later for grouping.
started = data["started_at"].dt
data["hour"] = started.hour
data["weekday"] = started.day_name()
data["day"] = started.day
data["month"] = started.month
data["month_name"] = started.month_name()
data["year"] = started.year

#Calculate Age
def calculate_age(born, today=None):
    """Return the age in completed years of someone born on ``born``.

    Parameters
    ----------
    born : datetime.date
        Date of birth. Only the ``year``, ``month`` and ``day`` attributes are
        read, so ``datetime.datetime`` values work as well.
    today : datetime.date, optional
        Reference date for the calculation. Defaults to ``date.today()``.

    Returns
    -------
    int
        ``today.year - born.year``, minus one when the birthday has not yet
        been reached in the reference year.

    Notes
    -----
    A February 29 birthday counts as reached from March 1 onwards in years
    that are not leap years.
    """
    if today is None:
        today = date.today()
    # Comparing (month, day) tuples avoids building a "birthday this year"
    # date, which does not exist for Feb 29 in a non-leap year.
    birthday_reached = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (0 if birthday_reached else 1)


##2.d Categorical to numerical
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# map() turns every title that is not a key of title_mapping into NaN, so the
# NaN values are filled with 5 BEFORE the integer cast (casting NaN to int raises).
data_set['Title'] = data_set['Title'].map(title_mapping).fillna(5).astype(int) #Make unknown a Rare

# Temporary helper column: 5 equal-width age bands from the training ages,
# only used to choose the cut points below; it is removed again right away.
data_set['AgeBand'] = pd.cut(train['Age'], 5, precision=0) #check age band
    #data_set[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

data_set = data_set.drop(['AgeBand'], axis=1)

# Replace Age by its band number; intervals are closed on the right:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
data_set["Age"] = pd.cut(data_set["Age"], bins=age_edges, labels=False).astype(int)

# Fill missing ports with the most frequent port of the training rows, then encode.
# NOTE(review): the result is written to data_set (like every other feature in
# this section) - confirm that no later step expects it on `data`.
freq_port = train.Embarked.dropna().mode()[0]
data_set['Embarked'] = data_set['Embarked'].fillna(freq_port).map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

#3.e Save
# Persist the cleaned table so that the analysis can start from the CSV file.

data21 = data  # second name for the same DataFrame object (no copy is made)
# The row index is written as the first CSV column because index=False is not passed.
# NOTE(review): consider index=False if that extra column is not wanted on reload.
data21.to_csv("data21_clean.csv")

In [None]:
#4. Analyse

# Ordered categories so that weekdays and months sort in calendar order.
day_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
month_order = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
data21['weekday'] = pd.Categorical(data21['weekday'], categories=day_order, ordered=True)
data21['month_name'] = pd.Categorical(data21['month_name'], categories=month_order, ordered=True)

# Row subsets shared by the hourly tables.
is_weekend = data21['weekday'].isin(['Saturday', 'Sunday'])
weekday_rides = data21.loc[~is_weekend]
weekend_rides = data21.loc[is_weekend]


def _mean_ride_length(frame, keys):
    """Average ride_length per group of `keys`, returned as a flat table."""
    return frame.groupby(keys, as_index=False)['ride_length'].mean()


def _ride_count(frame, keys):
    """Number of non-null ride_id values per group of `keys`, as a flat table."""
    return frame.groupby(keys, as_index=False)['ride_id'].count()


# Average ride length: overall, per hour (week / weekend), per weekday, per month.
data21_v1 = _mean_ride_length(data21, 'member_casual')
data21_v2_week = _mean_ride_length(weekday_rides, ['member_casual', 'hour'])
data21_v2_weekend = _mean_ride_length(weekend_rides, ['member_casual', 'hour'])
data21_v3 = _mean_ride_length(data21, ['member_casual', 'weekday'])
data21_v4 = _mean_ride_length(data21, ['member_casual', 'month_name'])

# Number of rides: same breakdowns as above.
data21_v5 = _ride_count(data21, 'member_casual')
data21_v6_week = _ride_count(weekday_rides, ['member_casual', 'hour'])
data21_v6_weekend = _ride_count(weekend_rides, ['member_casual', 'hour'])
data21_v7 = _ride_count(data21, ['member_casual', 'weekday'])
data21_v8 = _ride_count(data21, ['member_casual', 'month_name'])

# Number of rides per bike type.
data21_v9 = _ride_count(data21, ['member_casual', 'rideable_type'])

In [None]:
#5. Visualize

# Plotting libraries used by this cell.
import matplotlib.pyplot as plt
import seaborn as sns


def _save_pie(values, labels, title, filename):
    """Draw a pie chart of `values`, save it as `filename` and show it."""
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.pie(values, labels=labels)
    ax.set_title(title)
    fig.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()


def _save_member_casual_chart(plot_func, frame, x, y, title, xlabel, ylabel, filename, figsize=(10, 7)):
    """Draw `frame` with a seaborn plotter, one series per member_casual value.

    `plot_func` is the seaborn function to use (sns.lineplot or sns.barplot).
    The figure is saved as `filename` at 300 dpi and then shown.
    """
    fig, ax = plt.subplots(figsize=figsize)
    plot_func(x=x, y=y, hue='member_casual', data=frame, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend(title='')   # keep the legend entries, hide the legend title
    fig.savefig(filename, dpi=300)
    plt.show()


# Total Member vs Casual
_save_pie(data21_v1['ride_length'], data21_v1['member_casual'],
          "Average Ride Length Member vs Casual Riders in 2021",
          'Average Ride Length Member vs Casual in 2021')
_save_pie(data21_v5['ride_id'], data21_v5['member_casual'],
          "Number of rides Member vs Casual Riders in 2021",
          'Number of rides Member vs Casual Riders in 2021')

#Member vs Casual per Day
_save_member_casual_chart(sns.lineplot, data21_v2_week, 'hour', 'ride_length',
                          "Average Ride Length Member vs Casual per Day",
                          "Hour", "Average Ride Length",
                          'Average Ride Length Member vs Casual per Day.png')
_save_member_casual_chart(sns.lineplot, data21_v6_week, 'hour', 'ride_id',
                          "Number of Rides Member vs Casual per Day",
                          "Hour", "Number of Rides",
                          'Number of Rides Member vs Casual per Day.png')

#Member vs Casual per day of the week
_save_member_casual_chart(sns.lineplot, data21_v3, 'weekday', 'ride_length',
                          "Average Ride Length Member vs Casual on Day of Week",
                          "Day of Week", "Average Ride Length",
                          'Average Ride Length Member vs Casual on Day of Week.png')
_save_member_casual_chart(sns.lineplot, data21_v7, 'weekday', 'ride_id',
                          "Number of Rides Member vs Casual on Day of Week",
                          "Day of Week", "Number of Rides",
                          'Number of Rides Member vs Casual on Day of Week.png')

#Member vs Casual per Month
_save_member_casual_chart(sns.lineplot, data21_v4, 'month_name', 'ride_length',
                          "Average Ride Length in Each Month",
                          "Month", "Average Ride Length",
                          'Average Ride Length in Each Month.png', figsize=(15, 7))
_save_member_casual_chart(sns.lineplot, data21_v8, 'month_name', 'ride_id',
                          "Number of Rides in Each Month",
                          "Month", "Number of Rides",
                          'Number of Rides in Each Month.png', figsize=(15, 7))

#Member vs Casual per bike type
# NOTE(review): the file name 'Preferred bike by usersh.png' looks like a typo
# ("usersh"); it is kept unchanged so existing references to the file still work.
_save_member_casual_chart(sns.barplot, data21_v9, 'rideable_type', 'ride_id',
                          "Preferred bike by users",
                          "Bike type", "Number of rides",
                          'Preferred bike by usersh.png')

In [None]:
#6. Conclusion and recommendation

Conclusion:
Total:
    - The majority of the riders are member users.
    - The average trip duration of casual riders is longer than that of members.

Week:
    - Both casual riders and members use bikes for long rides during weekends
    - Casual riders more often use bikes for leisure or personal activities. Their usage is higher on weekends.

Year:
    - During the summer months, the number of rides is at its highest level for both casual and member riders.

Type:
    - Classic bikes are the most popular kind.
    - Only casual users use docked bikes, member users prefer classic bikes, and both use electric bikes almost equally.

Recommendation:
- The Marketing Team could set up a campaign in the app (no advertisement costs!) on how using a bikeshare can improve the commute to work: less traffic, more sustainable. Additionally, send out information on the advantages of an annual membership.
- Offer some kind of promotion or launch marketing campaigns during summer, or occasionally on weekends.
- A special 'Summer Membership' can be introduced specifically for casual riders who are hesitant to go for an annual membership.


Next:
    - Go more into detail per hour
    - Location-based advertisements (features Instagram and Facebook)

In [None]:
#Classification (random forest)
from sklearn.ensemble import RandomForestClassifier

# Split the cleaned table back into its two parts by PassengerId.
# NOTE(review): data_set is used because the feature engineering in section 3
# was applied to data_set - confirm this is the table meant for modelling.
train = data_set.loc[data_set['PassengerId'] <= 891]
test = data_set.drop(train.index)                       #all rows that are not in train
train = train.drop(['PassengerId'] , axis=1)

# Target and feature matrices. PassengerId and Survived are removed from BOTH
# feature matrices so that the model sees the same columns at fit and predict time.
y_train = train["Survived"]
X_train = train.drop(columns=['Survived'])
X_test = test.drop(columns=['PassengerId', 'Survived'])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# One row per test passenger: its id and the predicted label.
output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")