<a href="https://colab.research.google.com/github/namakmurtaza/Predicting-disease-outbreak/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import plotnine as pltn
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

In [None]:
hep = pd.read_csv("/content/hepatitis.csv")
measles = pd.read_csv("/content/measles.csv")
mump = pd.read_csv("/content/mumps.csv")
polio = pd.read_csv("/content/polio.csv")
pert = pd.read_csv("/content/pertussis.csv")
rub = pd.read_csv("/content/rubella.csv")
smlpx = pd.read_csv("/content/smallpox.csv")


In [None]:
combined_dataset = None

dataset_files = [
    "/content/hepatitis.csv", "/content/measles.csv", "/content/mumps.csv",
    "/content/pertussis.csv", "/content/polio.csv", "/content/rubella.csv", "/content/smallpox.csv"
]

for file_path in dataset_files:
    dataset = pd.read_csv(file_path)

    if combined_dataset is None:
        combined_dataset = dataset
    else:
        try:
            combined_dataset = pd.concat([combined_dataset, dataset], ignore_index=True)
        except ValueError as e:
            print(f"Error concatenating dataset {file_path}: {e}")
        except Exception as e:
            print(f" error occurred: {e}")

# Data preprocessing
if combined_dataset is not None:
    try:
        combined_dataset['week'] = combined_dataset['week'].astype(str)
        combined_dataset['year'] = combined_dataset['week'].str.slice(0, 4).astype(int)
        combined_dataset['week_number'] = combined_dataset['week'].str.slice(4).astype(int)
    except Exception as e:
        print(f"Error splitting 'week' column into year and week number: {e}")
else:
    print("Concatenation failed.")

In [None]:
combined_dataset['cases'] = pd.to_numeric(combined_dataset['cases'], errors='coerce')
mean_cases = combined_dataset['cases'].mean()
combined_dataset['cases'].fillna(mean_cases, inplace=True)

In [None]:
missing_values = combined_dataset.isna().sum()
print("Missing Values:")
print(missing_values)
combined_dataset['cases'].fillna(0, inplace=True)


Missing Values:
week                    0
state                   0
state_name              0
disease                 0
cases                   0
incidence_per_capita    0
year                    0
week_number             0
dtype: int64


In [None]:
# Feature scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
combined_dataset['cases'] = scaler.fit_transform(combined_dataset[['cases']])

In [None]:
column_name = 'cases'
Q1 = combined_dataset[column_name].quantile(0.25)
Q3 = combined_dataset[column_name].quantile(0.75)
IQR = Q3 - Q1
outlier_threshold = 1.5
outliers = combined_dataset[(combined_dataset[column_name] < (Q1 - outlier_threshold * IQR)) | (combined_dataset[column_name] > (Q3 + outlier_threshold * IQR))]

median_value = combined_dataset[column_name].median()
combined_dataset.loc[outliers.index, column_name] = median_value

In [None]:
combined_dataset['cases_1_month_ago'] = combined_dataset.groupby(['state', 'disease'])['cases'].shift(1)
print(combined_dataset['cases_1_month_ago'])

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
            ...   
600479    0.000096
600480    0.000096
600481    0.000096
600482    0.000096
600483    0.000192
Name: cases_1_month_ago, Length: 600484, dtype: float64


In [None]:
combined_dataset['incidence_category'] = pd.cut(combined_dataset['incidence_per_capita'], bins=[0, 0.1, 0.5, 1, float('inf')], labels=['low', 'moderate', 'high', 'very high'])


In [None]:
threshold = 0.0005
combined_dataset['outbreak'] = (combined_dataset['cases'] > threshold).astype(int)


In [None]:
combined_dataset['cases_times_incidence'] = combined_dataset['cases'] * combined_dataset['incidence_per_capita']


In [None]:
print(combined_dataset.head())

     week state  state_name      disease     cases  incidence_per_capita  \
0  196601    AL     ALABAMA  HEPATITIS A  0.000481                  0.14   
1  196601    AR    ARKANSAS  HEPATITIS A  0.001057                  0.58   
2  196601    AZ     ARIZONA  HEPATITIS A  0.000577                  0.37   
3  196601    CA  CALIFORNIA  HEPATITIS A  0.000192                  0.47   
4  196601    CO    COLORADO  HEPATITIS A  0.000096                  0.05   

   year  week_number  cases_1_month_ago  cases_times_incidence  outbreak  \
0  1966            1                NaN               0.000067         0   
1  1966            1                NaN               0.000613         1   
2  1966            1                NaN               0.000213         1   
3  1966            1                NaN               0.000090         0   
4  1966            1                NaN               0.000005         0   

  incidence_category  
0           moderate  
1               high  
2           moder

RANDOM FOREST CLASSIFIER

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into features (X) and target variable (y)
X = combined_dataset[['cases', 'incidence_per_capita', 'year', 'week_number', 'cases_1_month_ago']]
y = combined_dataset['outbreak']
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
# Split the data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create and train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)


Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     95106
           1       1.00      1.00      1.00     24991

    accuracy                           1.00    120097
   macro avg       1.00      1.00      1.00    120097
weighted avg       1.00      1.00      1.00    120097



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Define the number of splits
n_splits = 5

# Create a StratifiedKFold object
stratified_kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Use stratified k-fold cross-validation
for train_index, test_index in stratified_kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Your modeling and evaluation code here


# Create a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a GridSearchCV object
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model with the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best estimator (model with the best hyperparameters)
best_clf = grid_search.best_estimator_

# Make predictions on the test set with the best model
y_pred = best_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")





KeyError: ignored

In [None]:
import joblib

# Save the trained model to a file
model_filename = "random_forest_model.pkl"
joblib.dump(clf, model_filename)


['random_forest_model.pkl']

Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder

# Load your dataset (replace 'your_dataset.csv' with the actual path to your dataset)
data = pd.read_csv('/content/hepatitis.csv')

# Specify the target variable
target_column = 'cases'

# Identify categorical columns (replace 'categorical_columns' with the actual names of categorical columns)
categorical_columns = ['state', 'incidence_per_capita']

# Separate the categorical columns and non-categorical columns
categorical_data = data[categorical_columns]
numerical_data = data.drop(columns=[target_column] + categorical_columns)

# Encode categorical columns using one-hot encoding
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_data = pd.get_dummies(categorical_data, columns=categorical_columns, drop_first=True)

# Combine the encoded categorical data with the numerical data
X = pd.concat([numerical_data, encoded_data], axis=1)
y = data[target_column]

# Split the data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)




ValueError: ignored

# To group regions with similar characterstics using K-Means Clustering

# MEASLES

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [None]:
features = data[['incidence_per_capita', 'cases']]

In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [None]:
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(scaled_features)
    wcss.append(kmeans.inertia_)


In [None]:
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(scaled_features)

In [None]:
data['Cluster'] = kmeans.labels_

In [None]:
plt.scatter(data['incidence_per_capita'], data['cases'], c=data['Cluster'], cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.title('Clusters of Regions with Similar Infectious Disease Characteristics')
plt.xlabel('incidence_per_capita')
plt.ylabel('cases')
plt.show()

-->Silhouette Score

The Silhouette Score measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The score ranges from -1 to 1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd

data= pd.read_csv('/content/measles.csv')
data = np.random.rand(100, 2)

kmeans = KMeans(n_clusters=3)
labels = kmeans.fit_predict(data)

silhouette_avg = silhouette_score(data, labels)



In [None]:
print(f"Silhouette Score: {silhouette_avg}")

-->Davies-Bouldin Index

The Davies-Bouldin Index (DBI) is a metric for evaluating the quality of clusters in a dataset. It measures the compactness and separation between clusters. The lower the Davies-Bouldin Index, the better the clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import numpy as np

data= pd.read_csv('/content/measles.csv')
data = np.random.rand(100, 2)

# Fit K-means model
kmeans = KMeans(n_clusters=3)
labels = kmeans.fit_predict(data)

# Calculate Davies-Bouldin Index
dbi_score = davies_bouldin_score(data, labels)


In [None]:
print(f"Davies-Bouldin Index: {dbi_score}")

# SMALLPOX