In [None]:
#import packages and show their versions
# jupyter notebook version '6.4.3'
# python version '3.8.12'
import numpy as np # '1.19.2'
import pandas as pd # '1.3.3'
import urllib.request # '3.8'
import bs4 as bs # '4.10.0'
import matplotlib as mpl # '3.4.3'
mpl.use('Qt5Agg')
import matplotlib.pyplot as plt
from scipy import stats # '1.6.2'
import statsmodels.formula.api as smf # '0.12.2'
from sklearn.model_selection import train_test_split # '0.23.1'
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor # '1.3.3'
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from pandas_profiling import ProfileReport # '3.1.0'
from chart_studio import plotly as pl # '1.1.0'
import plotly.offline as po # '5.1.0'
import plotly.express as px
import cufflinks as cf # '0.17.3'
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Put plotly (via plotly.offline) and cufflinks into offline/notebook mode.
# Presumably required so the DataFrame.iplot(...) calls used by the plotting
# functions further down render inside the notebook -- TODO confirm against the
# installed cufflinks/plotly versions.
po.init_notebook_mode(connected = True)
cf.go_offline()

### The next two cells create the data the user needs. The first choice offers random data to practise with; the second lets the user enter their own data if it is not in a file; the third imports a file from the user's device; and the last imports tables from a URL. (There is surely more to add, such as importing from databases using SQL or NoSQL languages.)

In [None]:
def create_data(data):
    """Build a DataFrame from one of four interactively selected sources.

    Parameters
    ----------
    data : int
        1 -> random integers, 2000 rows x 5 columns named 'A'..'E';
        2 -> values typed in by the user, column by column (kept as strings);
        3 -> a file path typed in by the user and read with ``pd.read_csv``;
        4 -> a URL typed in by the user; the first HTML table on the page is
             used and also written to 'my_file.csv'.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` is returned, after a hint has been printed, when the choice
        is not 1-4 or when the file / URL could not be loaded.

    Raises
    ------
    ValueError
        In mode 2, when the number of columns or rows is not an integer.
    """
    if data == 1:
        values = np.random.randint(0, 1000000, 10000).reshape(2000, 5)
        return pd.DataFrame(values, columns=['A', 'B', 'C', 'D', 'E'])

    if data == 2:
        col_num = int(input('Please enter the number of columns: '))
        print('Enter the names of columns: ')
        names = [input() for _ in range(col_num)]

        row_num = int(input('Please enter the number of rows: '))
        columns = []
        for i in range(col_num):
            column_values = []
            for j in range(row_num):
                print(f'Enter the {j+1} value for {i+1} column: ')
                # Values stay as strings on purpose: the user may type text or
                # dates, and numeric columns can be cast in a later cell.
                column_values.append(input())
            columns.append(column_values)
        print()
        print('Done with your Data!')
        return pd.DataFrame(dict(zip(names, columns)))

    if data == 3:
        file = input('Enter the file name: ')
        # NOTE(review): only read_csv is used here, whatever the extension is.
        try:
            return pd.read_csv(file, parse_dates=True)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
            print('Please check your file name/directory and try again!')
            return None

    if data == 4:
        file = input('Enter Webpage url: ')
        try:
            # The context manager closes the HTTP response even when decoding fails.
            with urllib.request.urlopen(file) as response:
                source = response.read().decode('utf-8')
            soup = bs.BeautifulSoup(source, 'lxml')
            table = soup.find_all('table')
            dataframe = pd.read_html(str(table))[0]
            dataframe.to_csv('my_file.csv', encoding='utf-8')
            return dataframe
        except Exception:
            print('Please enter a valid url for table scraping')
            return None

    print('DataFrame failed! please enter number between 1 and 4 and try again!')
    return None

## Run the next cell to create your Data

In [None]:
# Show the source menu, read the user's choice and build `dataframe`.
# The hint now lists all four options (it used to say "1, 2 or 3").
print('Select the type of your data (By enter 1, 2, 3 or 4): ')
print('1. Random data with 2000 rows and 5 columns to play and learn!: ')
print('2.Customize your dataframe as you need (please make every column of the same data type): ')
print('3.Upload csv/json/txt file: ')
print('4.Enter url: ')

# Reset first so a failed attempt cannot leave a stale frame from an earlier
# run of this cell in the kernel.
dataframe = None
try:
    data = int(input('Wht is your choice?: '))
    dataframe = create_data(data)
except Exception as err:
    # `Exception` rather than a bare except: a kernel interrupt must still stop the cell.
    print('Please enter a number between 1-4 and try again!')
    print(f'Reason: {type(err).__name__}: {err}')
print()
print()


print('DataFrame information is given below!\n')
if dataframe is None:
    # create_data returns None when the choice was invalid or loading failed.
    print('Please check if you create the data!')
else:
    dataframe.info()

## The next cell shows the first five rows of the data, to give the user an idea of their data and whether any modifications are needed.

In [None]:
# Announce and display the first five rows of the DataFrame built above.
# head() is deliberately the last expression of the cell so the notebook
# renders its result as the cell output.
print('First five rows of your DataFrame is given below!')
dataframe.head()

## The next cell lets the user cast any columns to numeric or date values after checking the head of the data.

In [None]:
def _prompt_and_cast(count_prompt, convert):
    """Ask how many columns to convert, then replace each named column of the
    global `dataframe` with `convert(column)`.

    Any failure (non-integer count, unknown column name, value that cannot be
    converted) stops the loop and prints a hint together with the reason.
    """
    try:
        count = int(input(count_prompt))
        for _ in range(count):
            name = input('Please enter the column name for casting: ')
            dataframe[name] = convert(dataframe[name])
            print('Done!')
    except Exception as err:
        # `Exception` rather than a bare except so a kernel interrupt still works;
        # the reason tells the user whether the name or the values were wrong.
        print('Please enter a valid entry!')
        print(f'Reason: {type(err).__name__}: {err}')


print('Do you need to convert any column to numeric?\nif no just press 0')
_prompt_and_cast('How many columns to cast to numeric values?: ',
                 lambda column: column.astype(float))

print()
print()
print('Do you need to convert any column to Date type?\nif no just press 0')
_prompt_and_cast('How many columns to cast to date values?: ', pd.to_datetime)

## Run the next cell to display your statistics summary

In [None]:
# Announce and display the statistics summary of the DataFrame.
# describe() is the last expression of the cell so the notebook renders it.
print('DataFrame statistics summary is given below check them\n')
dataframe.describe()

## The next two cells contain the main function that produces a summary report for the whole data set

In [None]:
def create_report(report):
    """Profile the global `dataframe`, show the report inline and save it.

    Parameters
    ----------
    report : int
        1 builds the report with ``minimal=True``; 2 builds it without that
        flag. Any other value only prints a hint.

    Returns
    -------
    The ProfileReport that was rendered and written to 'your_report.html',
    or None when `report` is neither 1 nor 2.
    """
    if report not in (1, 2):
        return print('Please enter 1 or 2 and try again!')

    # Both variants share every option except `minimal`.
    options = {
        "title": "Summary Report",
        "explorative": True,
        "plot": {"dpi": 200, "image_format": "png"},
    }
    if report == 1:
        options["minimal"] = True

    finalreport = ProfileReport(dataframe, **options)
    finalreport.to_notebook_iframe()
    finalreport.to_file("your_report.html")
    return finalreport

## Run the next cell to create your summary report

In [None]:
# Ask which report variant to build and run create_report with that choice.
print('1.Big Data')
print('2.Small Data')

try:
    report = int(input('Select the type of your Data: '))
    create_report(report)
except Exception as err:
    # `Exception` rather than a bare except: profiling can take a long time and
    # a kernel interrupt must be able to stop it. The reason is shown so the
    # user can tell a bad menu entry from a failure inside the report itself.
    print('Please check your enteries and try again!')
    print(f'Reason: {type(err).__name__}: {err}')

## The next two cells contain the plotting code and the main visualization function, and ask what kind of plot the user wants. The user can export the plot to the plot.ly website and edit it as they like.

In [None]:
def visualize1(plot):
    """Plot every column of the global `dataframe` with the selected plot type.

    Parameters
    ----------
    plot : int
        1 line, 2 scatter (markers), 3 bar, 4 histogram, 5 box, 6 surface,
        7 heat map of the correlation matrix.

    Returns
    -------
    Whatever ``DataFrame.iplot`` returns for choices 1-6, the plotly figure
    for choice 7, or None (after printing a hint) for any other value.
    """
    # Keyword arguments handed to DataFrame.iplot for each menu entry.
    iplot_options = {
        1: {'kind': 'scatter'},
        2: {'kind': 'scatter', 'mode': 'markers', 'symbol': 'x', 'colorscale': 'paired'},
        3: {'kind': 'bar'},
        4: {'kind': 'hist'},
        5: {'kind': 'box'},
        6: {'kind': 'surface'},
    }

    if plot in iplot_options:
        return dataframe.iplot(**iplot_options[plot])

    if plot == 7:
        # The former `corr.style.background_gradient(...).set_precision(2)` line
        # was removed: its result was discarded, and Styler.set_precision no
        # longer exists in pandas 2.x, so it could only make the heat map fail.
        finalplot = px.imshow(dataframe.corr())
        finalplot.show()
        return finalplot

    print('Select only between 1 to 7!')
    return None

def visualize2(plot):
    """Plot one, two or three user-chosen columns of the global `dataframe`.

    The user is first asked how many columns to plot (1, 2 or 3) and then for
    the column names; only afterwards is `plot` interpreted.

    Parameters
    ----------
    plot : int
        1 line, 2 scatter (markers), 3 bar, 4 histogram, 5 box,
        6 violin (one or two columns only), 7 surface (two or three columns only).

    Returns
    -------
    Whatever ``iplot`` returns for choices 1-5 and 7, the plotly figure for
    the violin plot, or None (after printing a hint) when the combination of
    column count and plot type is not supported.

    Raises
    ------
    ValueError
        When the number of columns typed in is not an integer.
    KeyError
        When a typed column name does not exist in `dataframe`.
    """
    # Keyword arguments handed to iplot for the plot types every column count supports.
    basic_options = {
        1: {'kind': 'scatter'},
        2: {'kind': 'scatter', 'mode': 'markers', 'symbol': 'x', 'colorscale': 'paired'},
        3: {'kind': 'bar'},
        4: {'kind': 'hist'},
        5: {'kind': 'box'},
    }

    col = int(input('Enter the number of columns you want to plot by selecting only 1 , 2 or 3: '))

    if col == 1:
        colm = input('Enter the column you want to plot by selecting any column from dataframe head: ')
        if plot in basic_options:
            return dataframe[colm].iplot(**basic_options[plot])
        if plot == 6:
            finalplot = px.violin(dataframe, y=colm, box=True, points='all')
            finalplot.show()
            return finalplot
        if plot in (7, 8):
            print('Bubble plot and surface plot require more than one column arguments!')
        else:
            print('Select only between 1 to 6!')
        return None

    if col == 2:
        print('Enter the columns you want to plot by selecting from dataframe head: ')
        columns = [input('First column: '), input('Second column: ')]
    elif col == 3:
        print('Enter the columns you want to plot: ')
        columns = [input('First column: '), input('Second column: '), input('Third column: ')]
    else:
        print('Please enter only 1 , 2 or 3!')
        return None

    # The frame is only indexed inside the branch that plots, so an unsupported
    # plot type prints its hint without first failing on a column lookup.
    if plot in basic_options:
        return dataframe[columns].iplot(**basic_options[plot])

    if plot == 6:
        if col == 3:
            print('Violin plot require just one or two cloumns not three!')
            return None
        x, y = columns
        # The former `finalplot = px.data.tips()` was dropped: it loaded a
        # sample data set whose value was overwritten on the very next line.
        finalplot = px.violin(dataframe, y=dataframe[y], x=dataframe[x], box=True,
                              points='all', hover_data=dataframe.columns)
        finalplot.show()
        return finalplot

    if plot == 7:
        return dataframe[columns].iplot(kind='surface')

    print('Please only select a number between 1 to 7!')
    return None


def create_visualizing(beta):
    """Show the plot-type menu for the chosen mode and draw the selected plot.

    Parameters
    ----------
    beta : int
        1 -> plot all columns (delegates to ``visualize1``);
        2 -> plot user-selected columns (delegates to ``visualize2``).

    Returns
    -------
    The value returned by ``visualize1`` / ``visualize2``, or None when `beta`
    is neither 1 nor 2 (a hint is printed in that case).

    Raises
    ------
    ValueError
        When the plot type typed in is not an integer.
    """
    if beta == 1:
        # Both menus have seven entries; the prompts used to say "1 and 6"
        # and "1 and 8".
        print('Enter a number between 1 and 7 to select the type of plot you need: ')
        print('1.Line plot')
        print('2.Scatter plot')
        print('3.Bar plot')
        print('4.Histogram')
        print('5.Box plot')
        print('6.Surface plot')
        print('7.Heat map')
        plot = int(input())
        return visualize1(plot)

    if beta == 2:
        print('Enter a number between 1 and 7 to select the type of plot you need: ')
        print('1.Line plot')
        print('2.Scatter plot')
        print('3.Bar plot')
        print('4.Histogram')
        print('5.Box plot')
        print('6.Violin plot')
        print('7.Surface plot')
        plot = int(input())
        return visualize2(plot)

    print('Please enter 1 or 2 and try again!')
    return None

## Run the next cell to create your plots

In [None]:
# Ask whether to plot every column or selected ones, then hand over to
# create_visualizing. Plotting only runs when the choice was parsed: before,
# a failed parse still entered the second try with an undefined (or stale,
# from an earlier run) `beta`.
print('What kind of plot you need? the complete plot analysis or by columns?\n')
try:
    beta = int(input('Press 1 for plotting all columns "not recommended for big data" or press 2 for specifying columns to plot: '))
except Exception:
    print('Please enter 1 or 2 and try again!')
else:
    try:
        create_visualizing(beta)
    except Exception as err:
        # `Exception` rather than a bare except so a kernel interrupt still works.
        print('Please check your enteries and the names of your columns and try again!')
        print(f'Reason: {type(err).__name__}: {err}')

## The next two cells contain the machine learning code and its main function

In [None]:
def _fit_linear_regression():
    """Prompt for two columns, fit a simple linear regression on them, save a
    scatter + fitted-line picture to 'my_model.png' and print the statistics."""
    print('Enter two columns you want by selecting from statistics summary table: ')
    x = input('First column (explantory variable): ')
    y = input('Second column (dependent variable): ')

    # DataFrame.dropna returns a new frame; the result used to be discarded,
    # so missing values reached linregress. Only the two columns that are
    # actually used decide which rows are dropped.
    clean = dataframe.dropna(subset=[x, y])
    explantory_variable = clean[x]
    dependent_variable = clean[y]
    slope, intercept, r, p, std_err = stats.linregress(explantory_variable, dependent_variable)
    fitted = slope * explantory_variable + intercept

    # Give the formula an explicit frame holding exactly the two names it
    # refers to, instead of relying on name lookup in the calling scope.
    model_frame = pd.DataFrame({
        'explantory_variable': explantory_variable,
        'dependent_variable': dependent_variable,
    })
    results = smf.ols('dependent_variable ~ explantory_variable', data=model_frame).fit()

    plt.switch_backend('Qt5Agg')
    # A dedicated figure that is closed after saving, so repeated runs do not
    # pile their points onto the same picture.
    fig, ax = plt.subplots()
    ax.scatter(explantory_variable, dependent_variable)
    ax.plot(explantory_variable, fitted)
    fig.savefig('my_model.png')
    plt.close(fig)

    print()
    print("(correlation = ", r, ")   ", " (P-value = ", p, ")   ", " (standard error = ", std_err, ")")
    print()
    print(results.summary())
    print()
    print("your model picture has been downloaded successfully!")
    return None


def _fit_random_forest():
    """Prompt for feature columns (with one value each) and a target column,
    report the validation error of a random forest, predict the target for the
    typed values and optionally for every row of a CSV file."""
    print('Enter three columns you want by selecting from statistics summary table: ')
    feature_values = {}
    try:
        a = int(input('How many columns you want to use in predicting process?: '))
        for i in range(a):
            b = input(f'Please enter the {i+1} column name to use it for predicting: ')
            feature_values[b] = float(input(f'Enter the {i+1} values to use it for predicting target value! '))
        if not feature_values:
            raise ValueError('at least one feature column is required')
        print('Done for the feautures!')
    except ValueError:
        # Stop here: training on a partial or empty feature list would only
        # fail later with a less helpful error.
        print('Please enter a valid entry!')
        return None

    c = input('Please enter the column name you want to predict: ')
    print('Done! check your result!')

    feature_names = list(feature_values)
    # Keep the result of dropna (it used to be discarded) and only require
    # the columns that take part in the model to be present.
    clean = dataframe.dropna(subset=feature_names + [c])
    X = clean[feature_names]
    y = clean[c]

    train_X, val_X, train_y, val_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
    forest_model = RandomForestRegressor(random_state=1)
    forest_model.fit(train_X, train_y)
    predicted = forest_model.predict(val_X)
    print('mean_absolute_error = ', mean_absolute_error(val_y, predicted))

    # Refit on all rows before predicting the user's values.
    rf_model_on_full_data = RandomForestRegressor(random_state=1)
    rf_model_on_full_data.fit(X, y)
    result = pd.DataFrame([feature_values])
    result[c] = rf_model_on_full_data.predict(result[feature_names])
    print(result)
    result.to_csv('show_pred.csv', index=False)
    print()
    print('your predictions saved to your computer successfully!')

    # Optionally predict a whole file instead of a single set of values.
    try:
        decision = int(input('If you want to predict a file please enter (1) if not press(0) to exit: '))
    except ValueError:
        print('Please enter just 0 or 1')
        return None

    if decision == 0:
        print('Done with your Machine Learning Model!')
    elif decision == 1:
        try:
            file = input('Plese input your file name and directory: ')
            df = pd.read_csv(file).dropna(axis=0, how='any')
            target_preds = rf_model_on_full_data.predict(df[feature_names])
            # Keep the file's row labels so predictions can be matched back.
            file_result = pd.DataFrame({c: target_preds}, index=df.index)
            file_result.to_csv('show_pred_file.csv')
            print()
            print('your predictions saved to your computer successfully!')
        except Exception as err:
            print('Please check the name of your file/directory and try again!')
            print(f'Reason: {type(err).__name__}: {err}')
    else:
        # The valid answers are 0 and 1 (the hint used to say "1 or 2").
        print('Please enter just 0 or 1')
    return None


def create_ML(gamma):
    """Run the machine-learning workflow selected by the user.

    Parameters
    ----------
    gamma : int
        1 -> simple linear regression between two columns of the global
             `dataframe` (plot saved to 'my_model.png');
        2 -> random-forest regression on user-chosen feature columns
             (predictions saved to 'show_pred.csv' and, optionally,
             'show_pred_file.csv').

    Returns
    -------
    None
        All results are printed or written to files.

    Raises
    ------
    KeyError
        When a typed column name does not exist in `dataframe`.
    """
    if gamma == 1:
        return _fit_linear_regression()
    if gamma == 2:
        return _fit_random_forest()
    print('Please only enter 1 or 2 and try again!')
    return None

## Run the next cell to create your machine learning model

In [None]:
# Ask which model to build and run create_ML with that choice.
print('What kind of machine learning you need?\nPlease enter numeric values only\n ')
try:
    gamma = int(input('Press 1 for Linear Regression (two columns), Press 2 for Random Forest: '))
except Exception:
    print('Please enter 1 or 2 and try again!')
else:
    try:
        create_ML(gamma)
    except Exception as err:
        # `Exception` rather than a bare except so a kernel interrupt can stop
        # a long training run; the reason tells a wrong column name from a
        # modelling error.
        print('Please check the names of your columns and try again!')
        print(f'Reason: {type(err).__name__}: {err}')

## The next cell contains XGBoost code

In [None]:
def create_XGB(xgb):
    """Train an XGBoost regressor on a user-supplied train file and save it.

    Parameters
    ----------
    xgb : int
        1 -> prompt for the target column, a test file and a train file, fit
             the model with early stopping on a 20 % validation split, print
             the validation predictions and their mean absolute error, and
             save the model to 'model_sklearn.json';
        0 -> print a goodbye message;
        anything else -> print a hint.

    Returns
    -------
    None
        All results are printed or written to disk.

    Raises
    ------
    KeyError
        When the target column is missing from the train file, or the test
        file lacks one of the selected feature columns.
    """
    if xgb == 0:
        print('see you soon')
        return None
    if xgb != 1:
        print('Please only enter a valid values and try again!')
        return None

    c = input('Please enter the column name you want to predict: ')

    # Stop as soon as a file cannot be read; continuing used to end in an
    # UnboundLocalError on the frame that was never assigned.
    try:
        test_file = input('Plese input your file name and directory for test file: ')
        X_test_full = pd.read_csv(test_file, parse_dates=True)
    except Exception:
        print('Please check the name of your file/directory and try again!')
        return None

    try:
        file = input('Plese input your file name and directory for train file: ')
        train_full = pd.read_csv(file, parse_dates=True)
    except Exception:
        print('Please check the name of your file/directory and try again!')
        return None

    # Remove rows with missing target, separate target from predictors.
    train_full = train_full.dropna(axis=0, subset=[c])
    y = train_full[c]
    X = train_full.drop(columns=[c])

    # Break off validation set from training data.
    X_train_full, X_valid_full, y_train, y_valid = train_test_split(
        X, y, train_size=0.8, test_size=0.2, random_state=0)

    # Categorical columns with relatively low cardinality.
    low_cardinality_cols = [
        cname for cname in X_train_full.columns
        if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"
    ]
    # Numeric columns.
    numeric_cols = [
        cname for cname in X_train_full.columns
        if X_train_full[cname].dtype in ['int64', 'float64']
    ]

    # Keep selected columns only.
    my_cols = low_cardinality_cols + numeric_cols
    X_train = X_train_full[my_cols].copy()
    X_valid = X_valid_full[my_cols].copy()
    X_test = X_test_full[my_cols].copy()

    # One-hot encode, then give validation/test exactly the training columns.
    X_train = pd.get_dummies(X_train)
    X_valid = pd.get_dummies(X_valid)
    X_test = pd.get_dummies(X_test)
    X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
    # NOTE(review): X_test is prepared but no predictions are made on it in
    # this function -- confirm whether test-set predictions were intended.
    X_train, X_test = X_train.align(X_test, join='left', axis=1)

    my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)

    # Early stopping must be evaluated on the encoded, column-aligned
    # validation frame; the raw X_valid_full (different columns, object
    # dtypes) was passed here before.
    my_model.fit(X_train, y_train, early_stopping_rounds=5,
                 eval_set=[(X_valid, y_valid)], verbose=False)

    predictions = my_model.predict(X_valid)
    print('Predictions = ', predictions)

    # scikit-learn's argument order is (y_true, y_pred).
    mae = mean_absolute_error(y_valid, predictions)
    print("Mean Absolute Error:", mae)

    # Save in json format.
    my_model.save_model("model_sklearn.json")

    print()
    print('your model saved to your computer successfully!')
    return None

## Run the next cell to create XGBoost model for your data

In [None]:
# Ask whether to train the XGBoost model and run create_XGB with the answer.
try:
    xgb = int(input('Press 1 for XGBoost Regression or Press 0 to exit: '))
except Exception:
    print('Please enter 1 or 0 and try again!')
else:
    try:
        create_XGB(xgb)
    except Exception as err:
        # `Exception` rather than a bare except so a kernel interrupt can stop
        # training; the reason distinguishes a bad column name from a
        # modelling error.
        print('Please enter a valid value and try again!')
        print(f'Reason: {type(err).__name__}: {err}')