<a href="https://colab.research.google.com/github/mohameddhamed/data-science-intro/blob/main/Step_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install scikit-learn pandas numpy matplotlib seaborn mlxtend

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
import os
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✓ Setup complete! Now upload your MachineLearningCSV.zip file:")
print("Click the 'Choose Files' button below")

# Upload the dataset
uploaded = files.upload()

# Extract the ZIP file
for filename in uploaded.keys():
    if filename.endswith('.zip'):
        with ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall('dataset')
        print(f"✓ Extracted {filename}")

# List extracted files
print("\nExtracted files:")
for root, dirs, files in os.walk('dataset'):
    for file in files:
        if file.endswith('.csv'):
            print(f"  - {file}")

✓ Setup complete! Now upload your MachineLearningCSV.zip file:
Click the 'Choose Files' button below


Saving MachineLearningCVE.zip to MachineLearningCVE.zip
✓ Extracted MachineLearningCVE.zip

Extracted files:
  - ._Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
  - ._Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
  - ._Monday-WorkingHours.pcap_ISCX.csv
  - ._Tuesday-WorkingHours.pcap_ISCX.csv
  - ._Wednesday-workingHours.pcap_ISCX.csv
  - ._Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
  - ._Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
  - ._Friday-WorkingHours-Morning.pcap_ISCX.csv
  - Friday-WorkingHours-Morning.pcap_ISCX.csv
  - Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
  - Tuesday-WorkingHours.pcap_ISCX.csv
  - Monday-WorkingHours.pcap_ISCX.csv
  - Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
  - Wednesday-workingHours.pcap_ISCX.csv
  - Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
  - Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv


In [2]:
# Load all CSV files
csv_files = []
for root, dirs, files in os.walk('dataset'):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

print(f"Found {len(csv_files)} CSV files")

# Load and concatenate
dfs = []
for file in csv_files:
    print(f"Loading {os.path.basename(file)}...")
    try:
        df = pd.read_csv(file)
        dfs.append(df)
    except Exception as e:
        print(f"  Error loading {file}: {e}")

# Concatenate all dataframes
df_full = pd.concat(dfs, ignore_index=True)

print(f"\n✓ Dataset loaded successfully!")
print(f"Total shape: {df_full.shape}")
print(f"Columns: {df_full.columns.tolist()}")

Found 16 CSV files
Loading ._Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...
  Error loading dataset/__MACOSX/MachineLearningCVE/._Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
Loading ._Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
  Error loading dataset/__MACOSX/MachineLearningCVE/._Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
Loading ._Monday-WorkingHours.pcap_ISCX.csv...
  Error loading dataset/__MACOSX/MachineLearningCVE/._Monday-WorkingHours.pcap_ISCX.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
Loading ._Tuesday-WorkingHours.pcap_ISCX.csv...
  Error loading dataset/__MACOSX/MachineLearningCVE/._Tuesday-WorkingHours.pcap_ISCX.csv: 'utf-8' codec can't decode byte 0xa2 in position 37: invalid start byte
Loading ._Wednesday-workingHours.pcap_ISCX.csv...
  Error loading dataset/__MAC

In [3]:
# Check current memory usage
print(f"Memory usage: {df_full.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Optimize dtypes
def optimize_dtypes(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)

    return df

# Apply optimization
df_full = optimize_dtypes(df_full)
print(f"Optimized memory: {df_full.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Memory usage: 1833.87 MB
Optimized memory: 991.59 MB


In [4]:
# Save to avoid reloading
df_full.to_pickle('dataset_full.pkl')
print("✓ Dataset saved as 'dataset_full.pkl'")

# To load it later:
# df_full = pd.read_pickle('dataset_full.pkl')

✓ Dataset saved as 'dataset_full.pkl'


In [5]:
# Add this NEW cell right after your dataset_full.pkl was created
from google.colab import drive
drive.mount('/content/drive')

# Create a project folder
import os
project_folder = '/content/drive/MyDrive/ML_Attack_Classifier'
os.makedirs(project_folder, exist_ok=True)
print(f"✓ Project folder ready: {project_folder}")

# Copy your dataset to Google Drive
import shutil
shutil.copy('dataset_full.pkl', f'{project_folder}/dataset_full.pkl')
print("✓ Dataset copied to Google Drive!")

Mounted at /content/drive
✓ Project folder ready: /content/drive/MyDrive/ML_Attack_Classifier
✓ Dataset copied to Google Drive!
