# Data Analysis and Visualization

In [2]:
import numpy as np
from scipy import stats
import pandas as pd
from IPython.display import HTML, display

def print_colored(text, color):
    display(HTML(f"<span style='color: {color};'>{text}</span>"))

# Python Basics
## Task 1
Create a function which takes a `numpy` 1D array as input and returns the following (in the same order as listed):
- **Max** - Maximum value in the array
- **Std** - Measure of variation between the elements of an array
- **Sum** - Value obtained as a result of adding all the elements of an array
- **Dot Product** - Inner product of between the array and itself

In [50]:
def array_stats(array):
    array_max = float(np.max(array))  # Convert to Python float
    array_std = float(np.std(array))  # Convert to Python float
    array_sum = float(np.sum(array))   # Convert to Python float
    array_dot = float(np.dot(array, array))  # Convert to Python float
    
    return array_max, array_std, array_sum, array_dot

# Create a 1D array of 10 random numbers between 0 and 1
random_array = np.random.rand(10)
print_colored(random_array.tolist(), 'cyan')  # Convert to list for printing

myStats = array_stats(random_array)
print('Maximum: ', myStats[0]) 
print('Standard Deviation :', myStats[1])
print('Sum :', myStats[2])
print('Dot Product: ', myStats[3])

Maximum:  0.9363119298412041
Standard Deviation : 0.3422644047856356
Sum : 4.0599759381249845
Dot Product:  2.8197896896480388


## Task 2
Create a function that takes in two `numpy` 1D arrays and returns the `correlation` and `p-value` as a tuple

In [48]:
import numpy as np
from scipy import stats

def correlation(array1, array2):
    # Get the Pearson correlation coefficient and p-value
    pearsonr_stats = stats.pearsonr(array1, array2)
    # Return both the correlation coefficient and the p-value
    return float(pearsonr_stats[0]), float(pearsonr_stats[1])  # First element is the coefficient, second is the p-value

# Example usage
array1 = np.random.randn(10)
array2 = np.random.randn(10)

corr, p_value = correlation(array1, array2)

print('Array 1: ', array1, '\nArray 2: ', array2)
print()
print('Pearson Correlation Coefficient:', corr)
print('P-value:', p_value)

Array 1:  [ 0.16057627 -0.48223545  1.13387204  2.28222354  0.14971118  0.8723842
  0.26031131  1.14130541 -0.01143671  0.49968502] 
Array 2:  [-1.56673697  0.0117414  -0.61902115  0.83229815  0.10602984  0.99942727
  1.20600736 -1.49725913  0.22816905  0.02987403]

Pearson Correlation Coefficient: 0.08959772264571106
P-value: 0.80557081706302


# Reading Data
## Comma Separated Files

In [6]:
# Define the column names as a list
names = ['age', 'workclass', 'fnlwgt', 'education', 'educationnum', 'martialstatus', 'occupation',
         'relationship', 'race', 'sex', 'capitalgain', 'capitalloss', 'hoursperweek', 'nativecountry', 'label']

# Generate sample data
np.random.seed(0)  # For reproducibility
data = {
    'age': np.random.randint(18, 70, size=100),  # Random ages between 18 and 70
    'workclass': np.random.choice(['Private', 'Self-emp', 'Gov', 'Federal', 'Local'], size=100),
    'fnlwgt': np.random.randint(10000, 1000000, size=100),  # Random final weight
    'education': np.random.choice(['Bachelors', 'Masters', 'HS-grad', 'Some-college', 'Doctorate'], size=100),
    'educationnum': np.random.randint(1, 16, size=100),  # Education level as a number
    'maritalstatus': np.random.choice(['Married', 'Single', 'Divorced', 'Widowed'], size=100),
    'occupation': np.random.choice(['Tech', 'Sales', 'Management', 'Clerical', 'Service'], size=100),
    'relationship': np.random.choice(['Husband', 'Wife', 'Not-in-family', 'Own-child', 'Unmarried'], size=100),
    'race': np.random.choice(['White', 'Black', 'Asian', 'Hispanic', 'Other'], size=100),
    'sex': np.random.choice(['Male', 'Female'], size=100),
    'capitalgain': np.random.randint(0, 10000, size=100),  # Random capital gain
    'capitalloss': np.random.randint(0, 1000, size=100),   # Random capital loss
    'hoursperweek': np.random.randint(1, 100, size=100),   # Random hours worked per week
    'nativecountry': np.random.choice(['United States', 'Canada', 'Mexico', 'India', 'China'], size=100),
    'label': np.random.choice(['<=50K', '>50K'], size=100)  # Random labels
}

# Create a DataFrame
df = pd.DataFrame(data, columns=names)

# Save to CSV
df.to_csv('sample_data.csv', index=False)

#print("Sample CSV file 'sample_data.csv' created successfully.")

# Read the CSV 
df = pd.read_csv("/workspaces/scribing/notebooks/data-analysis/sample_data.csv", header=None, names=names)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationnum,martialstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,label
0,age,workclass,fnlwgt,education,educationnum,martialstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,label
1,62,Private,919528,HS-grad,4,,Tech,Own-child,Hispanic,Male,5802,70,8,China,<=50K
2,65,Self-emp,25741,Doctorate,7,,Service,Unmarried,Asian,Female,8058,21,52,United States,<=50K
3,18,Self-emp,853800,Doctorate,2,,Management,Wife,Other,Female,2489,734,74,Mexico,<=50K
4,21,Federal,104275,HS-grad,8,,Tech,Not-in-family,Black,Male,1169,707,88,Mexico,>50K


## JSON Files

In [13]:
import json

# Open the JSON file and load the data
with open('/workspaces/scribing/notebooks/data-analysis/data.json') as f:
    json_data = json.load(f)  # Load the JSON data directly into a Python dictionary

# Now you can print or manipulate json_data as needed
print(json_data)

{'glossary': {'title': 'example glossary', 'GlossDiv': {'title': 'S', 'GlossList': {'GlossEntry': {'ID': 'SGML', 'SortAs': 'SGML', 'GlossTerm': 'Standard Generalized Markup Language', 'Acronym': 'SGML', 'Abbrev': 'ISO 8879:1986', 'GlossDef': {'para': 'A meta-markup language, used to create markup languages such as DocBook.', 'GlossSeeAlso': ['GML', 'XML']}, 'GlossSee': 'markup'}}}}}


## Raw Files

In [19]:
import tempfile

tmp = tempfile.NamedTemporaryFile()

#Open the file for writing, and write the data
with open(tmp.name, 'w') as f:
    f.write("Allison|22|F\n")
    f.write("Lauren|31|F\n")
    f.write("Paul|25|M")
    
first_values = [] # Define a list to store the first values of each row

with open(tmp.name, "r") as f:  # Open the file to read
    for line in f:  # Loop over each line
        row_values = line.split("|") # Split each line by the | character into a list
        first_values.append(row_values[0])
        
print(first_values)

['Allison', 'Lauren', 'Paul']
