<a href="https://colab.research.google.com/github/mohitgitgeek/MohitVS.Datahack/blob/main/DATAHACKMOHITVS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Train and compare four classifiers for each vaccine target (xyz_vaccine and
# seasonal_vaccine), then write class predictions for the test-set
# respondents to vaccine_predictions.csv.

# Read the data files (replace with actual file paths)
file1_path = "/content/training_set_features.csv"
file2_path = "/content/training_set_labels.csv"
file3_path = "/content/test_set_features.csv"
file4_path = "/content/submission_format.csv"

# Read data into DataFrames
df1 = pd.read_csv(file1_path)  # training features
df2 = pd.read_csv(file2_path)  # training labels
df3 = pd.read_csv(file3_path)  # features of the respondents to predict
df4 = pd.read_csv(file4_path)  # submission template; loaded for reference only

# Join each training row to its labels. Stacking the four files row-wise
# (the previous approach) left every feature row without a label and every
# label row without features.
# NOTE(review): assumes both training files carry a 'respondent_id' column
# and that the labels file has 'xyz_vaccine' and 'seasonal_vaccine' -- confirm.
combined_df = (
    df1.merge(df2, on='respondent_id', how='inner')
    # Rows with an unknown label cannot be used for supervised training;
    # filling them with the mean would create invalid fractional classes.
    .dropna(subset=['xyz_vaccine', 'seasonal_vaccine'])
    # A clean 0..n-1 index lets index labels double as row positions below.
    .reset_index(drop=True)
)

# Features (select relevant features)
selected_features = ['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
                     'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
                     'behavioral_large_gatherings', 'behavioral_outside_home',
                     'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
                     'chronic_med_condition', 'child_under_6_months', 'health_worker',
                     'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
                     'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
                     'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

X = combined_df[selected_features]
# Each target reads its own column (y_xyz previously read 'seasonal_vaccine').
y_xyz = combined_df['xyz_vaccine'].astype(int)
y_seasonal = combined_df['seasonal_vaccine'].astype(int)

# Split once, before any preprocessing is fitted, so that the hold-out rows
# do not influence the imputation means, the scaler or the PCA basis.
(X_train_raw, X_test_raw,
 y_train_xyz, y_test_xyz,
 y_train_seasonal, y_test_seasonal) = train_test_split(
    X, y_xyz, y_seasonal, test_size=0.2, random_state=42)
train_pos = X_train_raw.index.to_numpy()
test_pos = X_test_raw.index.to_numpy()

# Handle missing values (means come from the training fold only)
feature_means = X_train_raw.mean()
X_imputed = X.fillna(feature_means)

# Standardize features (statistics fitted on the training fold only)
scaler = StandardScaler()
scaler.fit(X_imputed.loc[train_pos])
X_scaled = scaler.transform(X_imputed)

# Apply PCA for feature transformation (basis fitted on the training fold only)
pca = PCA(n_components=10)
pca.fit(X_scaled[train_pos])
X_pca = pca.transform(X_scaled)

X_train = X_pca[train_pos]
X_test = X_pca[test_pos]


def build_classifiers():
    """Return fresh, unfitted instances of the four classifiers, keyed by display name."""
    return {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "SVM": SVC(kernel='linear', C=1.0, random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Naive Bayes": GaussianNB(),
    }


def fit_and_score(classifiers, features_train, features_test, y_train, y_test, target_name):
    """Fit every classifier and report its accuracy on the hold-out split.

    Args:
        classifiers: dict mapping a display name to an unfitted estimator;
            the estimators are fitted in place.
        features_train: feature matrix used for fitting.
        features_test: feature matrix used for scoring.
        y_train: labels aligned with ``features_train``.
        y_test: labels aligned with ``features_test``.
        target_name: target label shown in the printed accuracy lines.

    Returns:
        A ``(predictions, accuracies)`` pair of dicts keyed like
        ``classifiers``, holding the hold-out predictions and accuracy scores.
    """
    predictions = {}
    accuracies = {}
    for label, clf in classifiers.items():
        clf.fit(features_train, y_train)
        predictions[label] = clf.predict(features_test)
        accuracies[label] = accuracy_score(y_test, predictions[label])
        print(f"Accuracy for {label} ({target_name}): {accuracies[label]:.2f}")
    return predictions, accuracies


# Train and evaluate classifiers for xyz_vaccine
classifiers_xyz = build_classifiers()
predictions_xyz, accuracies_xyz = fit_and_score(
    classifiers_xyz, X_train, X_test, y_train_xyz, y_test_xyz, "xyz_vaccine")

# Individual names kept for any later cell that refers to them
# (dicts preserve insertion order, so the unpacking order is fixed).
rf_xyz, svm_xyz, lr_xyz, nb_xyz = classifiers_xyz.values()
y_pred_rf_xyz, y_pred_svm_xyz, y_pred_lr_xyz, y_pred_nb_xyz = predictions_xyz.values()
accuracy_rf_xyz, accuracy_svm_xyz, accuracy_lr_xyz, accuracy_nb_xyz = accuracies_xyz.values()

# Train and evaluate classifiers for seasonal_vaccine
classifiers_seasonal = build_classifiers()
predictions_seasonal, accuracies_seasonal = fit_and_score(
    classifiers_seasonal, X_train, X_test, y_train_seasonal, y_test_seasonal,
    "seasonal_vaccine")

# For each target, predict with the classifier that scored best on the hold-out.
best_xyz = max(accuracies_xyz, key=accuracies_xyz.get)
best_seasonal = max(accuracies_seasonal, key=accuracies_seasonal.get)

# Push the test-set features through the same fitted preprocessing.
# NOTE(review): assumes the test-set file has 'respondent_id' and every
# column in selected_features -- confirm.
X_submission = pca.transform(
    scaler.transform(df3[selected_features].fillna(feature_means)))
y_pred_xyz = classifiers_xyz[best_xyz].predict(X_submission)
y_pred_seasonal = classifiers_seasonal[best_seasonal].predict(X_submission)

# Create a DataFrame with respondent_id, xyz_vaccine, and seasonal_vaccine
results_df = pd.DataFrame({
    'respondent_id': df3['respondent_id'],
    'xyz_vaccine': y_pred_xyz,
    'seasonal_vaccine': y_pred_seasonal
})

# Save the results to a CSV file
results_df.to_csv('vaccine_predictions.csv', index=False)
print("Results saved to vaccine_predictions.csv")
