In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Load Data
weather = pd.read_csv('weather.csv')
train = pd.read_csv('train.csv')

# Simplified Weather Data Cleaning and Feature Engineering
# Handling 'M' and '-' in weather data
weather.replace('M', np.nan, inplace=True)
weather.replace('-', np.nan, inplace=True)
weather.replace('T', 0.005, inplace=True)  # Handling Trace amounts in precipitation

# Type Conversion
weather['Tavg'] = pd.to_numeric(weather['Tavg'], errors='coerce')
weather['PrecipTotal'] = pd.to_numeric(weather['PrecipTotal'], errors='coerce')
weather['WetBulb'] = pd.to_numeric(weather['WetBulb'], errors='coerce')
weather['StnPressure'] = pd.to_numeric(weather['StnPressure'], errors='coerce')
weather['SeaLevel'] = pd.to_numeric(weather['SeaLevel'], errors='coerce')
weather['AvgSpeed'] = pd.to_numeric(weather['AvgSpeed'], errors='coerce')

# Filling NaN Values
weather['Tavg'].fillna(weather['Tavg'].mean(), inplace=True)
weather['PrecipTotal'].fillna(weather['PrecipTotal'].mean(), inplace=True)
weather['WetBulb'].fillna(weather['WetBulb'].mean(), inplace=True)
weather['StnPressure'].fillna(weather['StnPressure'].mean(), inplace=True)
weather['SeaLevel'].fillna(weather['SeaLevel'].mean(), inplace=True)
weather['AvgSpeed'].fillna(weather['AvgSpeed'].mean(), inplace=True)

weather['Date'] = pd.to_datetime(weather['Date'])
train['Date'] = pd.to_datetime(train['Date'])

# Averaging Weather Data from Station 1 and 2
# weather_stations_avg = weather.groupby('Date').mean().reset_index()

# # Merging Train and Weather Data
# train['Date'] = pd.to_datetime(train['Date'])
# weather_stations_avg['Date'] = pd.to_datetime(weather_stations_avg['Date'])
# merged_df = train.merge(weather_stations_avg, on='Date', how='left')

# # Modeling: Example with Logistic Regression
# # Features and Target
# features = merged_df[['Latitude', 'Longitude', 'Tavg', 'PrecipTotal']]  # Example features
# target = merged_df['WnvPresent']

# # Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(
#     features, target, test_size=0.2, random_state=42, stratify=target
# )

# # Logistic Regression Model
# lr_model = LogisticRegression(random_state=42)
# lr_model.fit(X_train, y_train)

# # Predictions and Probabilities
# y_pred = lr_model.predict(X_test)
# y_proba = lr_model.predict_proba(X_test)[:, 1]

# # Model Performance Metrics
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("ROC-AUC Score: ", roc_auc_score(y_test, y_proba))

# ROC Curve
# plt.figure(figsize=(8, 6))
# plot_roc_curve(lr_model, X_test, y_test)
# plt.title('Receiver Operating Characteristic (ROC) Curve')
# plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
# plt.show()


TypeError: can only concatenate str (not "int") to str

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Compute ROC curve and ROC area
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Plotting
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
