In [22]:
import pandas as pd
from DataUtil import *


In [23]:
# Load the student-performance CSV into a pandas DataFrame.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on the original author's machine; consider a configurable data
# directory and a path relative to it.
df = pd.read_csv(r"C:\Users\ammar\OneDrive\Desktop\PracticeData\StudentPerformanceFactors.csv")

In [24]:
# Wrap the loaded frame in a DataTools helper (presumably supplied by the
# wildcard import from DataUtil — confirm).
dt = DataTools(df)

In [25]:
def quick_test():
    """
    Quick smoke test for DataTools.

    Builds a five-row DataFrame containing missing values, repeated values
    and untidy name strings, wraps it in DataTools, and calls each core
    method once, printing every result under a labelled header.

    Nothing is asserted: the closing message only means that no call raised.
    Takes no arguments and returns None.
    """
    # Imported locally so the NaN literals below do not depend on ``np``
    # leaking into the notebook namespace through a wildcard import.
    import numpy as np

    # Local names differ from the notebook-level ``df``/``dt`` on purpose, so
    # the fixture is never confused with the real data.
    sample = pd.DataFrame({
        "age": [20, 25, np.nan, 30, 30],
        "salary": [300, 400, 500, np.nan, 400],
        "dept": ["IT", "IT", "HR", "HR", "HR"],
        "name": [" Ali ", "ALI", "Sara!", "Omar", "Omar"]
    })

    print("=== Original Data ===")
    print(sample)

    tools = DataTools(sample)

    # overview() is unpacked into two values here.
    print("\n=== Overview ===")
    summary, info = tools.overview()
    print(summary)
    print(info)

    # Every transforming call below passes inplace=False; presumably that
    # leaves the wrapped frame untouched between steps — confirm against
    # DataTools.
    print("\n=== Fill Missing (median) ===")
    print(tools.fillMissingValues("median", inplace=False))

    print("\n=== Drop Duplicates (dept + salary) ===")
    print(tools.dropDuplicates(columns=["dept", "salary"], inplace=False))

    print("\n=== IQR Outliers ===")
    print(tools.detect_outliers_iqr())

    print("\n=== Z-score Outliers ===")
    print(tools.detect_outliers_zscore())

    print("\n=== Encoding (One-Hot) ===")
    print(tools.oneHotEncode(inplace=False))

    print("\n=== Scaling (standard) ===")
    print(tools.scale("standard", inplace=False))

    print("\n=== Feature Combination ===")
    print(tools.combineFeatures("double_salary", "salary * 2", inplace=False))

    print("\n=== Text Cleaning ===")
    print(tools.cleanText(["name"], remove_punct=True, inplace=False))

    print("\n=== TEST COMPLETED SUCCESSFULLY ===")


In [26]:
# Preview the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [27]:
# NOTE(review): wildcard import placed mid-notebook; the only name the visible
# cells use from it is `DataSetGenerator`. Consider importing that name
# explicitly in the top import cell.
from DataSetGenerator import *

# Build the general testing dataset and keep it in `d`.
d = DataSetGenerator.generate_test_dataset()

In [28]:
# Wrap the generated dataset in DataTools.
# NOTE(review): this rebinds `df`, which an earlier cell bound to the
# DataFrame returned by pd.read_csv; from here on `df` is a DataTools object,
# not a DataFrame. A distinct name would make re-running cells safer.
df = DataTools(d)

In [29]:
# Show the column labels of the generated dataset.
d.columns

Index(['age', 'salary', 'rating', 'department', 'city', 'name', 'review_text',
       'join_date'],
      dtype='object')

In [30]:
# Fill missing values in the "salary" column using the "mean" strategy,
# requesting an in-place update. Other columns are not targeted; the recorded
# output still shows a missing age in the last displayed row.
df.fillMissingValues(
    strategy="mean",
    columns="salary",
    inplace=True,
)

Unnamed: 0,age,salary,rating,department,city,name,review_text,join_date
0,22.0,3252.128057,2.0,Sales,Irbid,Khaled??,Great product!!!,2022-04-16
1,54.0,2516.773706,5.0,HR,Irbid,Omar!!,BAD quality,2023-09-09
2,48.0,3252.128057,3.0,Marketing,Aqaba,Ali,Excellent!!,2023-09-29
3,38.0,2980.931176,4.0,IT,Madaba,Lina,BAD quality,2022-05-01
4,38.0,2618.237725,2.0,Marketing,Amman,Lina,BAD quality,2021-04-17
...,...,...,...,...,...,...,...,...
52495,24.0,4755.529458,2.0,Marketing,Zarqa,Lina,Worth the price,2018-03-06
52496,30.0,3366.446283,3.0,IT,Madaba,SARA,Worth the price,2023-03-23
52497,20.0,3950.880720,5.0,Sales,Irbid,Khaled??,Excellent!!,2018-04-25
52498,,3723.607947,1.0,IT,Aqaba,Khaled??,BAD quality,2023-08-31


In [31]:
# Convert "salary" to integers, requesting an in-place update. In the recorded
# output the fractional part is dropped rather than rounded
# (2516.773706 -> 2516).
# NOTE(review): this method takes `column=` while fillMissingValues and
# cleanText take `columns=`.
df.to_int(
    column="salary",
    inplace=True,
)

Unnamed: 0,age,salary,rating,department,city,name,review_text,join_date
0,22.0,3252,2.0,Sales,Irbid,Khaled??,Great product!!!,2022-04-16
1,54.0,2516,5.0,HR,Irbid,Omar!!,BAD quality,2023-09-09
2,48.0,3252,3.0,Marketing,Aqaba,Ali,Excellent!!,2023-09-29
3,38.0,2980,4.0,IT,Madaba,Lina,BAD quality,2022-05-01
4,38.0,2618,2.0,Marketing,Amman,Lina,BAD quality,2021-04-17
...,...,...,...,...,...,...,...,...
52495,24.0,4755,2.0,Marketing,Zarqa,Lina,Worth the price,2018-03-06
52496,30.0,3366,3.0,IT,Madaba,SARA,Worth the price,2023-03-23
52497,20.0,3950,5.0,Sales,Irbid,Khaled??,Excellent!!,2018-04-25
52498,,3723,1.0,IT,Aqaba,Khaled??,BAD quality,2023-08-31


In [32]:
# Clean the "name" column with punctuation removal enabled; the recorded
# output shows names lower-cased and stripped of "?" / "!".
# NOTE(review): `inplace` is not passed here, unlike the two preceding cells —
# whether the wrapped frame is modified depends on the method's default;
# confirm against DataTools.
df.cleanText(columns="name",remove_punct=True)

Unnamed: 0,age,salary,rating,department,city,name,review_text,join_date
0,22.0,3252,2.0,Sales,Irbid,khaled,Great product!!!,2022-04-16
1,54.0,2516,5.0,HR,Irbid,omar,BAD quality,2023-09-09
2,48.0,3252,3.0,Marketing,Aqaba,ali,Excellent!!,2023-09-29
3,38.0,2980,4.0,IT,Madaba,lina,BAD quality,2022-05-01
4,38.0,2618,2.0,Marketing,Amman,lina,BAD quality,2021-04-17
...,...,...,...,...,...,...,...,...
52495,24.0,4755,2.0,Marketing,Zarqa,lina,Worth the price,2018-03-06
52496,30.0,3366,3.0,IT,Madaba,sara,Worth the price,2023-03-23
52497,20.0,3950,5.0,Sales,Irbid,khaled,Excellent!!,2018-04-25
52498,,3723,1.0,IT,Aqaba,khaled,BAD quality,2023-08-31


In [33]:
# Choose a dataset through DataSetGenerator's menu and bind the result to `df`.
# NOTE(review): the recorded run printed a numbered menu and ended with
# "KeyboardInterrupt: Interrupted by user", so this cell appears to wait for
# typed input and will stall an unattended Restart & Run All. It also rebinds
# `df`, replacing the DataTools object used by the preceding cells.
df = DataSetGenerator.menu()


Dataset Generator Menu
----------------------
1) General Testing Dataset
2) Customer Churn Dataset
3) Fake Reviews Dataset
4) Sales Orders Dataset
5) Edge Cases Dataset
0) Exit


KeyboardInterrupt: Interrupted by user