In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
# Silence every warning for the rest of the session so library notices do not clutter cell output.
warnings.simplefilter("ignore")

In [None]:
# Load the landslide records from the CSV file next to the notebook into a DataFrame.
csv_path = 'LandslideDataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# The date column is discarded before any further processing.
df = df.drop(columns=["date"])

In [None]:
# Preview the first five rows again to confirm the date column is gone.
df.head(n=5)

In [None]:
# info() gives a quick insight into the data: non-null count and dtype per column,
# plus the total number of rows and columns.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Frequency of each longitude value; dropna=False keeps NaN as its own bucket so missing rows are visible.
df["lon"].value_counts(dropna=False)

In [None]:
# Keep only the rows that have a latitude; rows whose 'lat' is NaN are removed.
has_latitude = df['lat'].notna()
df = df.loc[has_latitude]

In [None]:
# Re-check the number of missing values per column after the row filter.
df.isna().sum()

In [None]:
# Summary statistics for the numeric columns: measures of central tendency
# together with the quartiles (Q1, median, Q3).
df.describe()

In [None]:
# Positional split of the frame: every column except the last is a feature, the last one is the target.
# NOTE(review): these two names are only displayed below; the modelling cells use x / y built from df1.
last_column = df.shape[1] - 1
features = df.iloc[:, :last_column]
target = df.iloc[:, last_column]

In [None]:
# Display the feature columns (all columns of df except the last).
features

In [None]:
# Display the target (the last column of df).
target

## Data cleaning is not required because no numerical column is stored as an object (string) dtype

## Label Encoding

In [None]:
# Separate numerical and categorical columns.
# "number" matches every numeric dtype. The bare "int"/"float" aliases resolve to the
# platform's default widths, so int64 columns can be missed where the default integer
# is 32-bit.
# .copy() gives each subset its own data, because later cells assign new values into
# these frames column by column.
num_cols = df.select_dtypes(include=["number"]).copy()
cat_cols = df.select_dtypes(include=["object"]).copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One shared encoder instance; the loop in the next cell refits it for every categorical column.
le = LabelEncoder()

In [None]:
# Replace each categorical column with its integer label codes.
# The single encoder is refit per column, so afterwards it only holds the classes of the last column.
for column_name in cat_cols.columns:
    encoded = le.fit_transform(cat_cols[column_name])
    cat_cols[column_name] = encoded

## Scaling numerical columns

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler used to standardise the numerical columns.
sc = StandardScaler()

In [None]:
# Fit the scaler on all numerical columns and get the scaled values back as a 2-D array.
# NOTE(review): the scaler is fitted on every row before the train/test split further down,
# so test-set statistics influence the scaling. Consider fitting on the training rows only.
num_cols_scaled = sc.fit_transform(num_cols)

In [None]:
# Copy the scaled values back into the frame, one column at a time, keeping the column names.
for position, column_name in enumerate(num_cols.columns):
    num_cols[column_name] = num_cols_scaled[:, position]

In [None]:
# Inspect the label-encoded categorical columns.
cat_cols

## Concatenating scaled numerical columns and label-encoded categorical columns

In [None]:
# Put the scaled numerical columns and the encoded categorical columns side by side.
# The last column of the combined frame is the target; everything before it is a feature.
df1 = pd.concat([num_cols, cat_cols], axis="columns")
x, y = df1.iloc[:, :-1], df1.iloc[:, -1]

In [None]:
# Class distribution of the target before any resampling.
y.value_counts()

In [None]:
# Bar chart of the class counts before resampling, drawn on an explicit small figure.
fig, ax = plt.subplots(figsize=(3, 3))
y.value_counts().plot(kind='bar', ax=ax)
plt.xticks(rotation=0)  # keep the class labels horizontal
ax.grid()
plt.show()

## The data is imbalanced, so the classes need to be rebalanced using oversampling or undersampling

In [None]:
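# Uncomment to install the resampling library into this kernel's environment
# (the package that provides `imblearn` is published as imbalanced-learn):
# %pip install imbalanced-learn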

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by randomly repeating rows; the seed keeps the resampling reproducible.
# NOTE(review): the resampling runs on the full data set and the train/test split happens later,
# so copies of the same row can land in both train and test. Consider splitting first and
# oversampling only the training part.
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(x, y)
# From here on x / y refer to the resampled data.
x, y = x_ros, y_ros

In [None]:
# Class distribution of the target after oversampling.
y_ros.value_counts()

In [None]:
# Bar chart of the class counts after oversampling, drawn on an explicit small figure.
fig, ax = plt.subplots(figsize=(3, 3))
y_ros.value_counts().plot(kind='bar', ax=ax)
plt.xticks(rotation=0)  # keep the class labels horizontal
ax.grid()
plt.show()

## The Data is cleaned and can be used for training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline classifiers with library-default hyperparameters.
# The tree-based estimators draw random numbers while fitting, so both get a fixed seed;
# previously only the random forest was seeded and the single tree could change between runs.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
svm = SVC()

In [None]:
def mymodel(model):
    """Fit ``model`` on the training split and print its test-split classification report.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    The same ``model`` object, now fitted.
    """
    model.fit(x_train, y_train)
    report = classification_report(y_test, model.predict(x_test))
    print(report)
    return model

In [None]:
# Baseline: fit the default logistic regression and print its test-split report.
mymodel(logreg)

In [None]:
# NOTE(review): this repeats the previous cell verbatim; it refits the same logistic
# regression and prints the same report, so the cell can probably be removed.
mymodel(logreg)

In [None]:
# Baseline: fit the default k-nearest-neighbours classifier and print its test-split report.
mymodel(knn)

In [None]:
# Baseline: fit the default decision tree and print its test-split report.
mymodel(dt)

In [None]:
# Baseline: fit the seeded random forest and print its test-split report.
mymodel(rf)

In [None]:
# Baseline: fit the default support vector classifier and print its test-split report.
mymodel(svm)

# Hyperparameter Tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for logistic regression, split into sub-grids so that only solver/penalty
# pairs scikit-learn supports are tried. The previous flat grid contained 'poly' (an SVM
# kernel, not a solver), solver/penalty pairs that do not exist, and 'elasticnet' without
# an l1_ratio, so many candidates could never be fitted.
# 'newton-cg' stays out of the solver list, as it was before.
C_values = [100, 10, 1, 0.1, 0.01]
parameters = [
    # No regularisation: C has no effect here, so it is not searched.
    # None replaces the string 'none', which recent scikit-learn releases no longer accept.
    {'solver': ['lbfgs', 'saga', 'sag'], 'penalty': [None]},
    {'solver': ['liblinear', 'lbfgs', 'saga', 'sag'], 'penalty': ['l2'], 'C': C_values},
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1'], 'C': C_values},
    # Elastic net is only available with saga and needs an explicit L1/L2 mixing ratio.
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': C_values},
]
gs = GridSearchCV(logreg, parameters, verbose=3)

In [None]:
# Run the logistic regression grid search on the training split (verbose=3 logs each fit).
gs.fit(x_train,y_train)

In [None]:
# Best hyperparameter combination found by the logistic regression search.
gs.best_params_

In [None]:
# Rebuild the model from the hyperparameters the search just selected instead of retyping
# them by hand: the copied values go stale when the data or grid changes, and the literal
# penalty='none' is rejected by recent scikit-learn releases.
logreg = LogisticRegression(**gs.best_params_)

# Fit on the training split and report precision/recall/F1 on the held-out split.
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
print(classification_report(y_test, y_pred))

# GridSearchCV For Decision Tree

In [None]:
# Search space for the decision tree.
# max_depth uses the Python value None (grow until the leaves are pure); the string 'none'
# is not a valid depth, so every candidate using it failed to fit.
parameters = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
gs = GridSearchCV(dt, parameters, verbose=3)

In [None]:
# Run the decision tree grid search on the training split (verbose=3 logs each fit).
gs.fit(x_train,y_train)

In [None]:
# Best hyperparameter combination found by the decision tree search.
gs.best_params_

In [None]:
# Rebuild the tree from the hyperparameters the search just selected instead of a
# hand-copied set, and fix the seed so the fitted tree is the same on every run.
dt = DecisionTreeClassifier(random_state=42, **gs.best_params_)

# Fit on the training split and report precision/recall/F1 on the held-out split.
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Score of the tuned logistic regression on the training split.
logreg.score(x_train,y_train)

In [None]:
# Score of the tuned logistic regression on the held-out split; compare with the training
# score above to gauge overfitting.
logreg.score(x_test,y_test)

In [None]:
# Show the most recent predictions; at this point y_pred holds the tuned decision tree's
# output for x_test.
y_pred

In [None]:
# Build the export table on a copy of the test features. A plain `df2 = x_test` is only
# an alias, so the two extra columns (including the true label) would be added to x_test
# itself and leak into any model cell that is run again afterwards.
df2 = x_test.copy()
df2['prediction'] = y_pred  # latest predictions, matched to the rows by position
df2['actual'] = y_test      # true labels, aligned on the shared index

In [None]:
# Write the test features with their predicted and actual labels to pred.csv in the
# current working directory.
df2.to_csv("pred.csv")