In [3]:
#Import dependencies
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [5]:
#Import data
avalanche_df = pd.read_csv("avalanche_data.csv")
avalanche_df.head()

Unnamed: 0,obs_date_time,obs_location,sky_cover,precip_type,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_direction,wind_speed,wind_gust,hazard,avalanche_obs_date_time
0,11/12/2015,Mt Roberts Tram Wx,OVC,SN,29.6,32.3,31.9,12.6,7.0,SE,27.0,42.0,0,
1,11/13/2015,Mt Roberts Tram Wx,OVC,SN,31.6,32.4,31.7,14.2,5.0,SE,26.0,29.0,0,
2,11/13/2015,Speel Arm Balcony Wx,OVC,SN,30.6,32.5,31.4,19.0,5.0,SSW,7.1,23.3,0,
3,11/14/2015,Mt Roberts Tram Wx,OVC,SN,31.6,32.4,31.8,22.4,7.0,SSE,10.0,10.0,0,
4,11/14/2015,Snowslide Creek Wx,OVC,RA,31.2,33.4,31.2,0.0,0.0,SSW,1.1,25.1,0,


In [184]:
#Clean data (edit target column)

#Edit target column (Replace Null with No)
avalanche_df["avalanche_obs_date_time"].fillna("No", inplace = True)

#Edit target column (Replace dates with Yes)
avalanche_df['avalanche_obs_date_time'] = avalanche_df["avalanche_obs_date_time"].replace(["2/2/2019", "2/8/2019", "2/20/2019", "2/28/2019", "3/3/2019", "3/18/2019", "3/19/2019","1/14/2020", "1/15/2020", "1/17/2020", "1/25/2020", "1/31/2020", "2/2/2020", "2/26/2020", 
"2/6/2020", "2/9/2020", "2/11/2020", "2/12/2020", "2/24/2020", "2/26/2020", "2/27/2020", "2/29/2020", "3/7/2020", "4/11/2020", "4/17/2020", "5/1/2020", "11/10/2020", "11/13/2020", "12/25/2020", "12/27/2020", 
"1/3/2021", "1/8/2021", "1/9/2021", "1/10/2021", "1/19/2021", "1/21/2021", "1/26/2021", "1/27/2021", "1/20/2021", "1/30/2021", "2/2/2021", "2/9/2021"], "Yes")

In [185]:
#Check that dates were changed to yes's
print(avalanche_df['avalanche_obs_date_time'].value_counts()['Yes'])

261


In [186]:
#Drop observation date
avalanche_df = avalanche_df.drop('obs_date_time',axis=1)

In [187]:
#Encode wind direction 
wind_direction_counts = avalanche_df.wind_direction.value_counts()
wind_direction_counts

# Determine which values to replace
replace_wind_direction = list(wind_direction_counts[wind_direction_counts < 150].index)

# Replace in DataFrame
for direction in replace_wind_direction:
    avalanche_df.wind_direction = avalanche_df.wind_direction.replace(direction,"Other")

#Encode
le = LabelEncoder()
avalanche_df['wind_direction'] = le.fit_transform(avalanche_df['wind_direction'])

avalanche_df.head()

Unnamed: 0,obs_location,sky_cover,precip_type,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_direction,wind_speed,wind_gust,hazard,avalanche_obs_date_time
0,Mt Roberts Tram Wx,OVC,SN,29.6,32.3,31.9,12.6,7.0,9,27.0,42.0,0,No
1,Mt Roberts Tram Wx,OVC,SN,31.6,32.4,31.7,14.2,5.0,9,26.0,29.0,0,No
2,Speel Arm Balcony Wx,OVC,SN,30.6,32.5,31.4,19.0,5.0,10,7.1,23.3,0,No
3,Mt Roberts Tram Wx,OVC,SN,31.6,32.4,31.8,22.4,7.0,7,10.0,10.0,0,No
4,Snowslide Creek Wx,OVC,RA,31.2,33.4,31.2,0.0,0.0,10,1.1,25.1,0,No


In [188]:
#Encode sky cover
sky_cover_counts = avalanche_df.sky_cover.value_counts()
sky_cover_counts

le = LabelEncoder()
avalanche_df['sky_cover'] = le.fit_transform(avalanche_df['sky_cover'])

avalanche_df.head()

Unnamed: 0,obs_location,sky_cover,precip_type,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_direction,wind_speed,wind_gust,hazard,avalanche_obs_date_time
0,Mt Roberts Tram Wx,3,SN,29.6,32.3,31.9,12.6,7.0,9,27.0,42.0,0,No
1,Mt Roberts Tram Wx,3,SN,31.6,32.4,31.7,14.2,5.0,9,26.0,29.0,0,No
2,Speel Arm Balcony Wx,3,SN,30.6,32.5,31.4,19.0,5.0,10,7.1,23.3,0,No
3,Mt Roberts Tram Wx,3,SN,31.6,32.4,31.8,22.4,7.0,7,10.0,10.0,0,No
4,Snowslide Creek Wx,3,RA,31.2,33.4,31.2,0.0,0.0,10,1.1,25.1,0,No


In [189]:
#Encode precipitation type
precip_type_counts = avalanche_df.precip_type.value_counts()
precip_type_counts

le = LabelEncoder()
avalanche_df['precip_type'] = le.fit_transform(avalanche_df['precip_type'])

avalanche_df.head()

Unnamed: 0,obs_location,sky_cover,precip_type,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_direction,wind_speed,wind_gust,hazard,avalanche_obs_date_time
0,Mt Roberts Tram Wx,3,4,29.6,32.3,31.9,12.6,7.0,9,27.0,42.0,0,No
1,Mt Roberts Tram Wx,3,4,31.6,32.4,31.7,14.2,5.0,9,26.0,29.0,0,No
2,Speel Arm Balcony Wx,3,4,30.6,32.5,31.4,19.0,5.0,10,7.1,23.3,0,No
3,Mt Roberts Tram Wx,3,4,31.6,32.4,31.8,22.4,7.0,7,10.0,10.0,0,No
4,Snowslide Creek Wx,3,2,31.2,33.4,31.2,0.0,0.0,10,1.1,25.1,0,No


In [190]:
#Encode location
obs_location_counts = avalanche_df.obs_location.value_counts()
obs_location_counts

# Determine which values to replace
replace_obs_location = list(obs_location_counts[obs_location_counts < 200].index)

# Replace in DataFrame
for location in replace_obs_location:
    avalanche_df.obs_location = avalanche_df.obs_location.replace(location,"Other")

#Encode
le = LabelEncoder()
avalanche_df['obs_location'] = le.fit_transform(avalanche_df['obs_location'])

avalanche_df.head()

Unnamed: 0,obs_location,sky_cover,precip_type,air_temp_min,air_temp_max,air_temp_current,snow_height,new_snow_height,wind_direction,wind_speed,wind_gust,hazard,avalanche_obs_date_time
0,3,3,4,29.6,32.3,31.9,12.6,7.0,9,27.0,42.0,0,No
1,3,3,4,31.6,32.4,31.7,14.2,5.0,9,26.0,29.0,0,No
2,3,3,4,30.6,32.5,31.4,19.0,5.0,10,7.1,23.3,0,No
3,3,3,4,31.6,32.4,31.8,22.4,7.0,7,10.0,10.0,0,No
4,3,3,2,31.2,33.4,31.2,0.0,0.0,10,1.1,25.1,0,No


In [191]:
#Create feature and target variables (x and y)
X = avalanche_df.drop("avalanche_obs_date_time", axis=1)
y = avalanche_df["avalanche_obs_date_time"]

In [192]:
#Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [193]:
#Scale data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [194]:
#Create logistic regression model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

In [195]:
#Classify logistic regression model
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [196]:
#Create predictions
y_pred = classifier.predict(X_test_scaled)

  "X does not have valid feature names, but"


In [197]:
#Display the confusion matrix
y_pred = classifier.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

  "X does not have valid feature names, but"


array([[828,  16],
       [ 57,   4]], dtype=int64)

In [198]:
#Classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

          No       0.94      0.98      0.96       844
         Yes       0.20      0.07      0.10        61

    accuracy                           0.92       905
   macro avg       0.57      0.52      0.53       905
weighted avg       0.89      0.92      0.90       905



In [199]:
#Get accuracy score
print(accuracy_score(y_test, y_pred))

0.9193370165745857
