My name is Orchlon (50291063). I plan to create my own structured dataset and experiment with it to become more familiar with the process.

In [28]:
import pandas as pd
import random
import numpy as np
from datetime import datetime, timedelta

# Fix the seeds of both RNGs so that reruns regenerate the identical dataset
random.seed(42)
np.random.seed(42)

# Cohort size: the dataset describes 50 students
n_students = 50

# Student IDs are consecutive integers, 10001 through 10050
student_ids = [10001 + offset for offset in range(n_students)]

# One score per student: a uniform draw from [0.0, 100.0], kept to two decimals
scores = [
    round(random.uniform(0.0, 100.0), 2)
    for _ in range(n_students)
]

# Catalogue of majors; each student is assigned one of them at random
majors_list = [
    "Computer Science",
    "Mathematics",
    "Physics",
    "Chemistry",
    "Biology",
    "Engineering",
    "Psychology",
    "Economics",
    "Literature",
    "History",
]
majors = [
    random.choice(majors_list)
    for _ in range(n_students)
]

# Graduation year per student: an integer from 2024 to 2027, both ends included
graduation_years = [
    random.randint(2024, 2027)
    for _ in range(n_students)
]

# Pass/Fail flag per student: True exactly when the score is at least 60
pass_fail = [earned >= 60.0 for earned in scores]

# Short confirmation of what was generated
print("Dataset created successfully!")
print(f"Total students: {len(student_ids)}")
print(f"Data types: Student_ID (int), Score (float), Major (str), Graduation_Year (int), Pass/Fail (bool)")


Dataset created successfully!
Total students: 50
Data types: Student_ID (int), Score (float), Major (str), Graduation_Year (int), Pass/Fail (bool)


In [30]:
# Map each column name to the list of per-student values generated earlier
data = {
    'Student_ID': student_ids,
    'Score': scores,
    'Major': majors,
    'Graduation_Year': graduation_years,
    'Pass/Fail': pass_fail
}

# Build the DataFrame; each dictionary key becomes a column
df = pd.DataFrame(data)

# Preview the top of the table
print("First 5 rows of the DataFrame:")
print(df.head(5))
print("\n" + "="*80 + "\n")

# Preview the bottom of the table
# (label corrected: it previously read "Last 5 5 of the DataFrame:")
print("Last 5 rows of the DataFrame:")
print(df.tail(5))
print("\n" + "="*80 + "\n")

# Show the dtype pandas inferred for every column
print("DataFrame dtypes:")
print(df.dtypes)


First 5 rows of the DataFrame:
   Student_ID  Score        Major  Graduation_Year  Pass/Fail
0       10001  63.94  Engineering             2025       True
1       10002   2.50  Engineering             2027      False
2       10003  27.50    Chemistry             2024      False
3       10004  22.32      Biology             2027      False
4       10005  73.65  Mathematics             2027       True


Last 5 5 of the DataFrame:
    Student_ID  Score             Major  Graduation_Year  Pass/Fail
45       10046  23.28         Economics             2025      False
46       10047  10.10       Mathematics             2025      False
47       10048  27.80  Computer Science             2026      False
48       10049  63.57       Mathematics             2027       True
49       10050  36.48           Physics             2026      False


DataFrame dtypes:
Student_ID           int64
Score              float64
Major               object
Graduation_Year      int64
Pass/Fail             bool
dtype: object

In [33]:
# Restrict the frame to its numeric columns and summarise them
numeric_columns = df.select_dtypes(include=[np.number])
print("\nDescriptive statistics:")
print(numeric_columns.describe())

print(f"\n{'=' * 80}\n")

# Report how many distinct values each categorical column holds
print("UNIQUE VALUES IN CATEGORICAL COLUMNS:")

# Categorical columns are the ones stored with object dtype
categorical_columns = df.select_dtypes(include=['object'])

for col_name in categorical_columns.columns:
    col_values = df[col_name]
    print(f"\nColumn: {col_name}")
    print(f"Number of unique values: {col_values.nunique()}")
    print(f"Unique values: {list(col_values.unique())}")


Descriptive statistics:
        Student_ID     Score  Graduation_Year
count     50.00000  50.00000        50.000000
mean   10025.50000  45.06760      2025.440000
std       14.57738  29.32915         1.109514
min    10001.00000   0.65000      2024.000000
25%    10013.25000  21.90500      2024.250000
50%    10025.50000  46.36500      2025.000000
75%    10037.75000  69.27500      2026.000000
max    10050.00000  97.31000      2027.000000


UNIQUE VALUES IN CATEGORICAL COLUMNS:

Column: Major
Number of unique values: 10
Unique values: ['Engineering', 'Chemistry', 'Biology', 'Mathematics', 'History', 'Physics', 'Literature', 'Economics', 'Psychology', 'Computer Science']


In [37]:
# Add a computed column: Convert Score to Letter Grade
def score_to_letter_grade(score):
    """
    Convert a numeric score to a letter grade.

    Grade bands (lower bound inclusive, upper bound exclusive):
    A: 90 and above
    B: 80 up to, but not including, 90
    C: 70 up to, but not including, 80
    D: 60 up to, but not including, 70
    F: below 60

    Parameters:
        score: numeric score (int or float). A missing value
               (None, NaN or pd.NA) is accepted as well.

    Returns:
        One of 'A', 'B', 'C', 'D', 'F', or None when the score is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # score would fall through all branches and be graded 'F'; None and pd.NA
    # would raise a TypeError instead. Treat all of them as "no grade".
    if pd.isna(score):
        return None

    if score >= 90:
        return 'A'
    elif score >= 80:
        return 'B'
    elif score >= 70:
        return 'C'
    elif score >= 60:
        return 'D'
    else:
        return 'F'

# Grade every score and store the result as a new Letter_Grade column
letter_grades = df['Score'].apply(score_to_letter_grade)
df['Letter_Grade'] = letter_grades

# Confirm the column was added, then tally students per grade in A-to-F order
column_names = list(df.columns)
print(f"\nDataFrame now has {len(column_names)} columns: {column_names}")
grade_counts = df['Letter_Grade'].value_counts()
print(grade_counts.sort_index())



DataFrame now has 6 columns: ['Student_ID', 'Score', 'Major', 'Graduation_Year', 'Pass/Fail', 'Letter_Grade']
Letter_Grade
A     2
B     7
C     3
D     7
F    31
Name: count, dtype: int64


In [39]:
# Save the DataFrame to a CSV file; index=False keeps the row index out of
# the file so it contains only the data columns
csv_filename = 'student_data_with_grades.csv'
df.to_csv(csv_filename, index=False)

print(f"DataFrame saved to '{csv_filename}' successfully!")
print(f"\nFile contains {len(df)} rows and {len(df.columns)} columns")
print(f"Columns: {', '.join(df.columns)}")
print("\nThe Letter_Grade column is a computed column based on Score values.")

# Verify the round trip by reading the file back and comparing it with what
# was written. Without these checks the success message below would be
# printed even if the file were truncated or had different columns.
df_verify = pd.read_csv(csv_filename)
assert df_verify.shape == df.shape, (
    f"CSV round trip changed the shape: wrote {df.shape}, read {df_verify.shape}"
)
assert list(df_verify.columns) == list(df.columns), (
    "CSV round trip changed the column names"
)
print(f"\n Verification: Successfully read {len(df_verify)} rows from the CSV file")


DataFrame saved to 'student_data_with_grades.csv' successfully!

File contains 50 rows and 6 columns
Columns: Student_ID, Score, Major, Graduation_Year, Pass/Fail, Letter_Grade

The Letter_Grade column is a computed column based on Score values.

 Verification: Successfully read 50 rows from the CSV file
