# Exploratory Data Analysis general


### Loading and Exploring the data

In [6]:
## Data Load 
import pandas as pd

# Load the data from the CSV file
file_path = r"C:\Users\MicrobeJ\Downloads\data.csv"
data = pd.read_csv(file_path)

# Check the structure of the dataset (rows and columns)
data_shape = data.shape

# Identify the different data types (classes) present in each column
data_types = data.dtypes

# Calculate the number of unique values in each column, excluding the header
unique_values_per_column = data.nunique()

# Organizing the information into a dataframe for better readability
data_structure_info = pd.DataFrame({
    'Data Types': data_types,
    'Unique Values': unique_values_per_column
})

print(data_shape, data_structure_info)

# Checking for columns with 0 unique values to see if they contain only NaNs or a constant value
empty_or_constant_columns = data.loc[:, data.nunique() == 0]

# Checking for any missing values across the dataset
missing_values = data.isnull().sum()

# Checking for any values that are exactly 0 across the dataset
zero_values = (data == 0).sum()

# Combine the information into a dataframe for better readability
missing_zero_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Zero Values': zero_values
})

print(empty_or_constant_columns, missing_zero_info)




(485, 59)                        Data Types  Unique Values
NAME                       object            485
NAME.id                    object            485
NAME.name                  object            485
ASSOCIATION                 int64              1
CLUMP                       int64              1
EXPERIMENT                 object              1
EXPERIMENT.count            int64              1
EXPERIMENT.date            object              1
EXPERIMENT.description    float64              0
EXPERIMENT.field            int64              1
EXPERIMENT.fullname        object              1
EXPERIMENT.id             float64              0
EXPERIMENT.index          float64              0
EXPERIMENT.name            object              1
EXPERIMENT.series           int64              1
IMAGE                      object              1
IMAGE.label                object              5
IMAGE.meta                 object              5
IMAGE.name                 object              1
INTENSITY 

In [7]:
# Making of the subset of the data only pulling possible useful varaibles

# Creating a subset of the data with the specified columns
subset_columns = [
    'NAME.id', 'NAME.name', 'EXPERIMENT.count', 'IMAGE.meta', 'IMAGE.name', 
    'INTENSITY.ch1', 'INTENSITY.ch2', 'INTENSITY.ch3', 'LOCATION', 'LOCATION.center', 
    'LOCATION.dist', 'LOCATION.half', 'LOCATION.pole', 'LOCATION.side', 'LOCATION.x', 
    'LOCATION.y', 'MAXIMA', 'MAXIMA.Maxima1', 'MAXIMA.count', 'MEDIAL', 'POSITION', 
    'POSITION.channel', 'POSITION.frame', 'POSITION.position', 'POSITION.slice', 
    'PROFILE_MEDIAL', 'SHAPE', 'SHAPE.angularity', 'SHAPE.area', 'SHAPE.aspectRatio', 
    'SHAPE.circularity', 'SHAPE.curvature', 'SHAPE.feret', 'SHAPE.length', 
    'SHAPE.morphology', 'SHAPE.orientation', 'SHAPE.perimeter', 'SHAPE.pole', 
    'SHAPE.roundness', 'SHAPE.sinuosity', 'SHAPE.solidity', 'SHAPE.width', 'ZSCORE'
]

# Selecting the columns from the dataset
data_subset = data[subset_columns]

# Display the first few rows of the subset to confirm
data_subset.head()




Unnamed: 0,NAME.id,NAME.name,EXPERIMENT.count,IMAGE.meta,IMAGE.name,INTENSITY.ch1,INTENSITY.ch2,INTENSITY.ch3,LOCATION,LOCATION.center,...,SHAPE.length,SHAPE.morphology,SHAPE.orientation,SHAPE.perimeter,SHAPE.pole,SHAPE.roundness,SHAPE.sinuosity,SHAPE.solidity,SHAPE.width,ZSCORE
0,fcd1aa01-d08f-4831-9504-72af78844077,b1,97,c:1/3 z:1/5 t:6/16 - 20230801_LZ22225_60min_in...,20230801_LZ22225_60min_inf.002.tif - T=5,Stats[mean=14197.771 min=9515.0 max=19888.0 st...,Stats[mean=18853.219 min=8898.0 max=27483.0 st...,Stats[mean=4635.9297 min=2458.0 max=6031.0 std...,(x=38.80 y=1.19),False,...,3.122824,1,40.3684,6.616892,2.0,0.175224,1.004721,0.975574,0.451076,3.433621
1,64d8162e-e9ea-4850-91fb-043bfd5caf3f,b2,97,c:1/3 z:1/5 t:6/16 - 20230801_LZ22225_60min_in...,20230801_LZ22225_60min_inf.002.tif - T=5,Stats[mean=13454.421 min=10518.0 max=20259.0 s...,Stats[mean=9703.965 min=7326.0 max=11510.0 std...,Stats[mean=8178.2104 min=4550.0 max=10558.0 st...,(x=44.02 y=0.70),False,...,1.457628,1,19.025343,3.419086,2.0,0.395642,1.0,0.996051,0.516315,2.841504
2,fea5b92e-852a-40c3-b7fd-1ea41c69f3f0,b3,97,c:1/3 z:1/5 t:6/16 - 20230801_LZ22225_60min_in...,20230801_LZ22225_60min_inf.002.tif - T=5,Stats[mean=15142.33 min=10027.0 max=21852.0 st...,Stats[mean=11617.922 min=6859.0 max=30524.0 st...,Stats[mean=7309.2 min=3403.0 max=10953.0 stdv=...,(x=44.74 y=0.89),False,...,1.923659,1,30.65048,4.804621,2.0,0.469209,1.0,0.943272,0.823764,2.976341
3,a221eb7d-ae56-48d8-86bb-bf63a372c3f9,b4,97,c:1/3 z:1/5 t:6/16 - 20230801_LZ22225_60min_in...,20230801_LZ22225_60min_inf.002.tif - T=5,Stats[mean=11937.072 min=9062.0 max=17243.0 st...,Stats[mean=8345.31 min=6825.0 max=9827.0 stdv=...,Stats[mean=5861.5454 min=2871.0 max=7928.0 std...,(x=3.01 y=0.87),False,...,1.405479,1,145.78427,3.301511,2.0,0.40553,1.0,0.998628,0.501003,3.758874
4,9e917210-8499-4918-9171-741008d64fb0,b5,97,c:1/3 z:1/5 t:6/16 - 20230801_LZ22225_60min_in...,20230801_LZ22225_60min_inf.002.tif - T=5,Stats[mean=11153.036 min=9184.0 max=15266.0 st...,Stats[mean=9310.5 min=7451.0 max=11066.0 stdv=...,Stats[mean=6719.452 min=4207.0 max=9343.0 stdv...,(x=23.14 y=2.59),False,...,2.102353,1,29.32732,4.645057,2.0,0.292261,1.014611,0.99088,0.513725,3.733687


The IMAGE.meta column entries appear to encode the Z stack information with a pattern c:x/y z:x/y t:x/y, where c represents the channel, z the Z stack level, and t the time point within the metadata string. We will focus on extracting the Z stack level (z:x/y) for our function. 

In [None]:
# A general Remapping/Renaming for Data Extraction
def extract_z_stack(meta_string):
    """Extract the Z stack level from the IMAGE.meta string."""
    match = re.search(r'z:(\d+)/(\d+)', meta_string)
    return match.group(1) if match else None

def rename_z_stack(data, meta_col, z_stack_col, new_name_mapping):
    """Rename the Z stack levels based on metadata information.

    Args:
        data (DataFrame): The pandas DataFrame containing the data.
        meta_col (str): The column name of the metadata.
        z_stack_col (str): The column name of the Z stack levels.
        new_name_mapping (dict): A dictionary mapping the original Z stack levels to new names.

    Returns:
        DataFrame: The pandas DataFrame with the Z stack levels renamed.
    """
    # Extract the Z stack levels
    data[z_stack_col] = data[meta_col].apply(extract_z_stack)
    
    # Map the Z stack levels to new names using the provided mapping
    data[z_stack_col] = data[z_stack_col].map(new_name_mapping)
    
    return data

# Since we don't have specific names to map, let's create an example mapping
# This mapping will be placeholder until we know the actual naming scheme you would like
example_new_name_mapping = {
    '1': 'Initial',
    '2': 'Mid-stack',
    '3': 'End'
}

# Apply the renaming function to the dataset
renamed_data = rename_z_stack(data_subset.copy(), 'IMAGE.meta', 'Z_Stack', example_new_name_mapping)

# Check the first few rows to confirm the renaming
renamed_data[['IMAGE.meta', 'Z_Stack']].head()


## Violin Plot the Intensities

In [None]:
#### Violin Plots Labeled by condition and time groupings 

# Plotting the violin plots with individual data points colored by 'time' and marker style based on 'condition'
plt.figure(figsize=(14, 6))

# Plot for mean
plt.subplot(1, 2, 1)
sns.violinplot(x='z_stack', y='mean', data=df_background_only, inner=None, color='lightgray')
sns.stripplot(x='z_stack', y='mean', data=df_background_only, hue='time', dodge=True,
              marker='o', alpha=0.5, edgecolor='gray',
              palette=sns.color_palette("hsv", len(df_background_only['time'].unique())))
# Modify markers based on condition
for condition in df_background_only['condition'].unique():
    subset = df_background_only[df_background_only['condition'] == condition]
    marker = 'x' if condition == 'inf' else 'o'
    sns.stripplot(x='z_stack', y='mean', data=subset, hue='time', dodge=True, 
                  marker=marker, alpha=0.5, edgecolor='gray',
                  palette=sns.color_palette("hsv", len(subset['time'].unique())))

plt.title('Mean Pixel Intensity by Z-stack')
plt.legend(title='Time', bbox_to_anchor=(1.05, 1), loc='upper left')

# Plot for standard deviation
plt.subplot(1, 2, 2)
sns.violinplot(x='z_stack', y='std_dev', data=df_background_only, inner=None, color='lightgray')
sns.stripplot(x='z_stack', y='std_dev', data=df_background_only, hue='time', dodge=True,
              marker='o', alpha=0.5, edgecolor='gray',
              palette=sns.color_palette("hsv", len(df_background_only['time'].unique())))
# Modify markers based on condition
for condition in df_background_only['condition'].unique():
    subset = df_background_only[df_background_only['condition'] == condition]
    marker = 'x' if condition == 'inf' else 'o'
    sns.stripplot(x='z_stack', y='std_dev', data=subset, hue='time', dodge=True, 
                  marker=marker, alpha=0.5, edgecolor='gray',
                  palette=sns.color_palette("hsv", len(subset['time'].unique())))

plt.title('Standard Deviation of Pixel Intensity by Z-stack')
plt.legend(title='Time', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()



In [10]:
import regex as re

# Define a function to parse the 'IMAGE.name' column and extract the specified components
def parse_image_name(image_name):
    # Parse the date (first 8 characters are the date in YYYYMMDD format)
    date = image_name[:8]
    
    # Use a regular expression to find the strain identifier, which seems to be in the format LZ followed by numbers
    strain_match = re.search(r'(LZ\d+)', image_name)
    strain = strain_match.group(1) if strain_match else None
    
    # Use a regular expression to find the time point (number followed by 'min')
    time_match = re.search(r'(\d+)min', image_name)
    time = int(time_match.group(1)) if time_match else None
    
    # Use a regular expression to find the condition ('inf' or similar pattern)
    cond_match = re.search(r'min_([a-zA-Z]+)', image_name)
    cond = cond_match.group(1) if cond_match else None
    
    # Use a regular expression to find the frame number (after 'T=')
    frame_match = re.search(r'T=(\d+)', image_name)
    frame = int(frame_match.group(1)) if frame_match else None
    
    return date, strain, time, cond, frame

# Apply the parsing function to the 'IMAGE.name' column and create new columns
data_subset[['date', 'strain', 'time', 'cond', 'frame']] = data_subset.apply(
    lambda row: parse_image_name(row['IMAGE.name']), axis=1, result_type="expand"
)

# Use .loc to ensure the operation is done on the original DataFrame
data_subset.loc[:, ['date', 'strain', 'time', 'cond', 'frame']] = data_subset.apply(
    lambda row: parse_image_name(row['IMAGE.name']), axis=1, result_type="expand"
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_subset[['date', 'strain', 'time', 'cond', 'frame']] = data_subset.apply(


In [11]:
# Adjusting the function to rename columns from 'int#' to 'ch#'
def microbej_extract_int_statistics(data, channel):
    """
    Extracts intensity statistics from a given intensity channel and adds them to the dataframe.
    
    Parameters:
    data (DataFrame): The pandas DataFrame containing the intensity information.
    channel (str): The channel number as a string to extract statistics for.
    
    Returns:
    DataFrame: The original DataFrame with additional columns for intensity statistics.
    """
    # Extract statistics
    data[f'ch{channel}.mean'] = data[f'INTENSITY.ch{channel}'].str.extract(r'mean=(\d+\.\d+)').astype(float)
    data[f'ch{channel}.min'] = data[f'INTENSITY.ch{channel}'].str.extract(r'min=(\d+\.\d+)').astype(float)
    data[f'ch{channel}.max'] = data[f'INTENSITY.ch{channel}'].str.extract(r'max=(\d+\.\d+)').astype(float)
    data[f'ch{channel}.stdv'] = data[f'INTENSITY.ch{channel}'].str.extract(r'stdv=(\d+\.\d+)').astype(float)
    
    # Return the dataframe with the new columns
    return data

# Apply the adjusted function to each channel for the entire dataset
for i in range(1, 4):  # Assuming there are 3 channels
    data_subset = microbej_extract_int_statistics(data_subset, str(i))

# Display the first few rows to confirm the new columns
data_subset[['ch1.mean', 'ch1.min', 'ch1.max', 'ch1.stdv', 'ch2.mean', 'ch2.min', 'ch2.max', 'ch2.stdv', 'ch3.mean', 'ch3.min', 'ch3.max', 'ch3.stdv']].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'ch{channel}.mean'] = data[f'INTENSITY.ch{channel}'].str.extract(r'mean=(\d+\.\d+)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'ch{channel}.min'] = data[f'INTENSITY.ch{channel}'].str.extract(r'min=(\d+\.\d+)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Unnamed: 0,ch1.mean,ch1.min,ch1.max,ch1.stdv,ch2.mean,ch2.min,ch2.max,ch2.stdv,ch3.mean,ch3.min,ch3.max,ch3.stdv
0,14197.771,9515.0,19888.0,3105.112,18853.219,8898.0,27483.0,4243.857,4635.9297,2458.0,6031.0,728.49835
1,13454.421,10518.0,20259.0,2176.9985,9703.965,7326.0,11510.0,998.7617,8178.2104,4550.0,10558.0,1556.4503
2,15142.33,10027.0,21852.0,3474.3198,11617.922,6859.0,30524.0,3804.9343,7309.2,3403.0,10953.0,2144.8518
3,11937.072,9062.0,17243.0,2310.1047,8345.31,6825.0,9827.0,717.039,5861.5454,2871.0,7928.0,1115.7247
4,11153.036,9184.0,15266.0,1471.9005,9310.5,7451.0,11066.0,817.687,6719.452,4207.0,9343.0,1357.0682


In [12]:
# 2 Condition Violin Plot Violin Plot, strplot function

import matplotlib.pyplot as plt
import seaborn as sns


def violin_strplot_twoconditionx_hue(df, metric, xcondition1, xcondition2, huecondition): #new_df_name='new_df'
    # Create a deep copy of the DataFrame
    df_copy = df.copy()
    
    # Create a new column combining 'xcondition1' and 'xcondition2'
    df_copy['xcondition1_xcondition2'] = df_copy[xcondition1].astype(str) + "_" + df_copy[xcondition2].astype(str)
    
    plt.figure(figsize=(15, 6))
    
    # Create the violin plot
    sns.violinplot(x='xcondition1_xcondition2', y=metric, data=df_copy, inner=None, dodge=True, color='gray', alpha=0.5)
    
    # Create the strip plot
    sns.stripplot(x='xcondition1_xcondition2', y=metric, data=df_copy, hue=huecondition, dodge=True, jitter=True, marker='o', alpha=0.5)
    
    plt.title(f'{metric.capitalize()} Intensity Across {xcondition1} and {xcondition2}, Colored by {huecondition}')
    plt.legend(title=huecondition, bbox_to_anchor=(1, 1), loc=2)
    plt.tight_layout()
    plt.show()
    
    #Save the new DataFrame to the global environment
    #globals()[new_df_name] = df_copy



ModuleNotFoundError: No module named 'seaborn'