Plato workbook

In [1]:
import pandas as pd

from core.core import Core, load_data

In [2]:
# Create the Core helper; the cells below call its generate_data,
# save_to_sqlite and query_data methods.
core = Core()

Generating Data using core

Note: generated data is saved to generated_data.csv by default




In [None]:
# Column specifications for the synthetic dataset: each entry carries a
# column name, a generator data_type and, where needed, generator options.
column_specs = [
    {"name": "age", "data_type": "int", "options": {"min": 18, "max": 75}},
    {"name": "name", "data_type": "name"},
    {"name": "email", "data_type": "email"},
    {"name": "address", "data_type": "address"},
    {"name": "phone", "data_type": "phone"},
    {"name": "company", "data_type": "company"},
    # NOTE(review): if max_nb_chars is a per-row character cap, 100_000 rows of
    # up to 20000 characters can be very large in memory and on disk -- confirm.
    {"name": "text", "data_type": "text", "options": {"max_nb_chars": 20000}},
    {"name": "income", "data_type": "float", "options": {"min": 10000, "max": 250000}},
    {"name": "salary", "data_type": "float", "options": {"min": 10000, "max": 300000}},
    {"name": "date_joined", "data_type": "date", "options": {"min": "2022-01-01", "max": "2022-12-31"}},
]

# Generate 100,000 rows using the specifications above.
core.generate_data(num_rows=100_000, columns=column_specs)

INFO:plato:Generating data...


Save generated data to SQLite database and query it

In [None]:
# Save the generated data into table 'generated_data' of the SQLite file
# data.db (a relative path, so it resolves against the working directory).
core.save_to_sqlite(db_path='data.db', table_name='generated_data')


In [None]:
# Fetch at most 200 rows with age above 20 from the generated_data table
# and print whatever query_data returns.
age_filter_sql = 'SELECT * FROM generated_data WHERE age > 20 LIMIT 200'
results = core.query_data(query=age_filter_sql)
print(results)

Loading data from the generated CSV file

In [None]:
# Load the CSV file that data generation writes by default.
df = load_data('generated_data.csv')
# Assuming load_data returns a pandas DataFrame: DataFrame.info() prints its
# report itself and returns None, so wrapping it in print() only appended a
# stray "None" line to the output.
df.info()

In [None]:
from data_transformation.cleaner import DataCleaner

In [None]:
# Wrap the loaded frame in a DataCleaner; the cleaning cells below all go
# through this object.
cleaner = DataCleaner(df)


In [None]:
# Cleaning pass. The return values are discarded, so these calls are relied on
# to update the cleaner's own state (presumably in place -- confirm in DataCleaner).
cleaner.remove_duplicates()
# "IQR" with 1.5 is presumably the interquartile-range rule with a 1.5x fence -- TODO confirm.
cleaner.remove_outliers(['age', 'income'], "IQR", 1.5)
cleaner.convert_data_types(['date_joined'], 'datetime')
# Preview the first rows of the cleaned result.
cleaner.get_cleaned_data().head()



In [None]:
# NOTE(review): the following cell standardizes the same two columns; if the
# cleaner updates its data in place, that step is applied on top of this
# normalization -- confirm the stacking is intended.
cleaner.normalize_data(['age', 'income'])
cleaner.get_cleaned_data().head()

In [None]:
# NOTE(review): targets the same columns as the preceding normalize_data call;
# presumably operates on the already-normalized values -- confirm.
cleaner.standardize_data(['age', 'income'])
cleaner.get_cleaned_data().head()

In [None]:
# Rebind df (until now the raw CSV load) to the cleaner's output; every later
# use of df in this notebook sees the cleaned data, not the raw data.
df = cleaner.get_cleaned_data()


In [None]:
from data_transformation.transformer import DataTransformer

In [None]:
# Wrap the cleaned frame in a DataTransformer.
# NOTE(review): if DataTransformer keeps a reference to df rather than a copy,
# the transforms below would also change df and anything else sharing it -- confirm.
transformer = DataTransformer(df)

In [None]:
# Scale the 'age' and 'income' columns; being the cell's last expression, the
# return value (if any) is shown as the cell output.
transformer.scale_data(['age', 'income'])


In [None]:
# Preview the first rows of the transformer's current data.
transformer.get_transformed_data().head()

In [None]:
# NOTE(review): this runs after scale_data on the same columns; if scaling can
# produce zero or negative values, a logarithm would yield NaN/-inf -- confirm
# how log_transform handles that.
transformer.log_transform(['age', 'income'])
transformer.get_transformed_data().head()

In [None]:
# Bin 'age' into 3 bins labelled young / middle-aged / old.
# Unlike the other transform cells, the return value is used here: this assumes
# bin_data returns an object exposing get_transformed_data() (e.g. the
# transformer itself) -- TODO confirm.
binned_age = transformer.bin_data('age', 3, ['young', 'middle-aged', 'old'])
binned_age.get_transformed_data().head()

In [None]:
# Round salary to the nearest thousand using a custom function
def round_salary(x):
    """Return x rounded to the nearest multiple of 1000 (round with ndigits=-3)."""
    return round(x, -3)
# Apply the function to the 'salary' column and preview the result.
transformer.apply_custom_transform('salary', round_salary)
transformer.get_transformed_data().head()


In [None]:
from data_analysis.quant import QuantitativeAnalysis

In [None]:
# Build the quantitative-analysis helper from the cleaner's current data
# (taken from the cleaner, not from the transformer).
quant = QuantitativeAnalysis(cleaner.get_cleaned_data())

In [None]:
# Show descriptive statistics for the data held by quant.
quant.descriptive_statistics()

In [None]:
import matplotlib.pyplot as plt

# Boxplot of df['income'], drawn through the explicit Figure/Axes interface.
# df was rebound to the cleaner's output earlier, so this shows the cleaned
# values rather than the raw generated income.
fig, ax = plt.subplots(figsize=(6, 6))
ax.boxplot(df['income'])
ax.set_title('Income Boxplot')
ax.set_ylabel('Income')
ax.grid(True)
plt.show()

In [None]:
# Correlation matrix for the data held by quant, shown as the cell output.
quant.correlation_matrix()

In [None]:
# Plotted counterpart of the correlation matrix computed in the previous cell.
quant.plot_correlation_matrix()

In [None]:
# Presumably the first argument is the response and the list holds the
# predictors, i.e. age modelled from income and salary -- verify the argument
# order against QuantitativeAnalysis.linear_regression.
quant.linear_regression('age', ['income', 'salary'])

In [None]:
# Run the helper's hypothesis test on 'age' and 'income'.
# NOTE(review): which test is used and what is returned is not visible here -- confirm.
quant.hypothesis_testing('age', 'income')

In [None]:
# Histogram of the 'age' column.
quant.plot_histogram('age')

In [None]:
# Histogram of the 'income' column.
quant.plot_histogram('income')

In [None]:
# Histogram of the 'salary' column.
quant.plot_histogram('salary')

In [None]:
# Histogram of 'date_joined', the column the cleaning step converted with
# convert_data_types([...], 'datetime').
quant.plot_histogram('date_joined')

In [None]:
# Scatter plot of 'age' and 'income'.
quant.plot_scatter('age', 'income')

In [None]:
from data_analysis.qual import QualitativeAnalysis

In [None]:
# Build the qualitative-analysis helper from the cleaner's current data.
qual = QualitativeAnalysis(cleaner.get_cleaned_data())

In [None]:
# Sentiment analysis over the generated 'text' column.
qual.sentiment_analysis('text')

In [None]:
# Word cloud built from the 'text' column.
qual.generate_wordcloud('text')

In [None]:
# Keyword extraction over the 'text' column.
qual.keyword_extraction('text')

In [None]:
from data_modeling.modeler import Modeler

In [None]:
# Build the modelling helper from the cleaner's current data.
modeler = Modeler(cleaner.get_cleaned_data())

In [None]:
# Split into train/test parts: 'income' is the target (y_*), 'age' and
# 'salary' are the features (X_*), matching how y_test and X_test are used
# when the prediction table is assembled further down.
X_train, X_test, y_train, y_test = modeler.train_test_split('income', ['age', 'salary'])

In [None]:
# Fit/evaluate the linear-regression model on the split; the result is read
# with .get('predictions') in the next cell, so it is dict-like.
lr = modeler.linear_regression(X_train, X_test, y_train, y_test)

In [None]:
# Pull the predictions out of the linear-regression result.
# NOTE(review): if lr is a plain dict, .get() yields None when the key is
# missing instead of raising -- confirm the key is always present.
lr_predictions = lr.get('predictions')

In [None]:
# Fit/evaluate the random-forest regressor on the same split; the result is
# read with .get('predictions') in the next cell, so it is dict-like.
rf = modeler.random_forest_regressor(X_train, X_test, y_train, y_test)

In [None]:
# Pull the predictions out of the random-forest result.
# NOTE(review): if rf is a plain dict, .get() yields None when the key is
# missing instead of raising -- confirm the key is always present.
rf_predictions = rf.get('predictions')

In [None]:
# Display the random-forest predictions as the cell output.
rf_predictions

In [None]:
# Put both models' predictions side by side, one column per model.
# The Series are named so the resulting columns are labelled
# 'lr_prediction' / 'rf_prediction' instead of the anonymous 0 / 1 that
# concat assigns to unnamed Series (which made later plots unreadable).
lr_predictions = pd.Series(lr_predictions, name='lr_prediction')
rf_predictions = pd.Series(rf_predictions, name='rf_prediction')
joined_predictions = pd.concat([lr_predictions, rf_predictions], axis=1)

In [None]:
# Add the held-out target and feature values next to the predictions.
# The prediction columns were built with pd.Series(...) and so carry a fresh
# 0..n-1 index, while y_test / X_test presumably keep the row labels of the
# frame they were split from. Assigning a Series to a DataFrame column aligns
# on index labels, which would pair predictions with the wrong rows or fill
# the columns with NaN. Predictions are in X_test row order, so assign by
# position via plain arrays; a length mismatch then raises instead of passing
# silently.
joined_predictions['income'] = pd.Series(y_test).to_numpy()
joined_predictions['age'] = pd.Series(X_test['age']).to_numpy()
joined_predictions['salary'] = pd.Series(X_test['salary']).to_numpy()

In [None]:
from data_modeling.visualizer import Visualizer

In [None]:
# Wrap the prediction table (model predictions plus income, age, salary)
# in the plotting helper.
visualizer = Visualizer(joined_predictions)

In [None]:
# Correlation plot across the columns of the prediction table.
visualizer.plot_correlation_matrix()


In [None]:
# 3D scatter over 'age', 'salary' and 'income'; color='income' presumably
# colors the points by the income column -- confirm in Visualizer.
visualizer.plot_3d_scatter('age', 'salary', 'income', color='income')

In [None]:
# Distribution plot of the 'income' column.
# NOTE(review): the title says "by Age" but only column='income' is passed and
# color is a literal color name -- confirm the title matches what is drawn.
visualizer.plot_distribution(column='income', color='gold', title='Income Distribution by Age')