In [1]:
# Step 1: Import necessary Python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Step 2: Load the interaction table from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='hippie-ppi.csv')

In [3]:
# Step 3: Explore the data
# First rows, to see what the two columns look like.
print(data.head())
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() emits a stray "None" line.
data.info()
# Summary statistics (count / unique / top / freq for object columns).
print(data.describe())

      to   from
0  AL1A1  AL1A1
1   ITA7   ACHA
2   NEB1   ACTG
3   SRGN   CD44
4   GRB7  ERBB2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411430 entries, 0 to 411429
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   to      404479 non-null  object
 1   from    409754 non-null  object
dtypes: object(2)
memory usage: 6.3+ MB
None
            to    from
count   404479  409754
unique   14616   16460
top        H31      H4
freq      6200    4508


In [4]:
# Step 4: Clean the data
# Drop exact duplicate rows first, then discard every row that still has a
# missing value in any column.
data = data.drop_duplicates().dropna()

In [None]:
# Known date-like gene names and the names they should be replaced with.
# Defined once at module level so the table is not rebuilt on every call
# (the function is applied to every cell of the DataFrame).
_GENE_NAME_CORRECTIONS = {
    "01-Dec": "DEC1",
    "01-Mar": "MARCH1",
    "01-Sep": "SEPT1",
    "02-Sep": "SEPT2",
    "03-Sep": "SEPT3",
    "04-Sep": "SEPT4",
    "05-Sep": "SEPT5",
    "06-Sep": "SEPT6",
    "07-Sep": "SEPT7",
    "08-Sep": "SEPT8",
    "09-Sep": "SEPT9",
    "10-Sep": "SEPT10",
    "11-Sep": "SEPT11",
    "12-Sep": "SEPT12",
    "14-Sep": "SEPT14",
    "15-Sep": "SEPT15",
}


def correct_gene_name(gene_name):
    """
    Corrects the gene names that are incorrectly formatted, such as "01-Dec" to "DEC1".

    Parameters
    ----------
    gene_name : str
        A gene name as read from the input table.

    Returns
    -------
    The corrected name when ``gene_name`` is one of the known date-like
    entries; otherwise ``gene_name`` itself, unchanged. Names that merely
    contain a hyphen (e.g. "HLA-A") are therefore passed through instead of
    raising ``KeyError``, and non-string values (e.g. NaN/None) are returned
    as-is instead of raising ``TypeError``.
    """
    # Non-string cells cannot be date-like names; leave them untouched.
    if not isinstance(gene_name, str):
        return gene_name

    # Fall back to the input itself when no correction is registered.
    return _GENE_NAME_CORRECTIONS.get(gene_name, gene_name)


# Correct every gene name in both columns. Each column is mapped with
# Series.map rather than DataFrame.applymap, which is deprecated since
# pandas 2.1; Series.map behaves the same element-wise on all versions.
data[["to", "from"]] = data[["to", "from"]].apply(
    lambda column: column.map(correct_gene_name)
)
display(data.head())

In [7]:
# Spot-check three rows (selected by index label) to confirm the SEP gene
# names were corrected.
data.loc[[3248, 3253, 3277], :]

Unnamed: 0,gene_1,gene_2
3248,SEPT2,STX1A
3253,SEPT5,STX1A
3277,SEPT5,STX4


In [None]:
# Step 5: Transform the data
# Aggregation example: collect, for every 'to' gene, all of its 'from' genes
# into one string. A ';' delimiter is used because a plain 'sum' on a string
# column concatenates the names with no separator, which makes the individual
# gene names impossible to recover.
grouped_data = data.groupby('to').agg({'from': ';'.join})

In [None]:
# Step 6: Write the cleaned table to CSV, omitting the index column
data.to_csv(path_or_buf='preprocessed_data.csv', index=False)