# Lecture 15

## Readings

Relevant content for today's lecture: none!
Today we are exploring how to analyze gene expression data in Python.

In [1]:
# DO NOT MODIFY CODE BELOW THIS LINE.
import numpy as np
import pandas as pd

# Remote CSV with the E. coli expression table used by every cell in this notebook.
GE_CSV_PATH = "https://gitlab.com/oasci/courses/pitt/biosc1540-2024s/-/raw/main/large-files/ecoli-transcriptome-cell.stress.csv"
# Read straight from the URL, so running this cell needs network access.
df_ge = pd.read_csv(GE_CSV_PATH)
# The gene_id column as a NumPy array, in the same row order as df_ge.
gene_names = df_ge["gene_id"].to_numpy()
# Number of rows in the table.
n_genes = len(df_ge)


def select_stress(df, stress):
    """Split the expression table into the four column groups for one stress.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``gene_id`` column, the three
        ``Parent_replicate*`` columns, a ``P_<stress>`` column, and the
        per-stress columns whose names contain ``stress`` and end in
        ``_NS`` or ``_S``.
    stress : str
        Stress label exactly as it appears in the column names,
        e.g. ``"NaCl"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(control_ns, control_s, resistant_ns, resistant_s)``. Every frame
        has ``gene_id`` as its first column.

    Raises
    ------
    KeyError
        If ``gene_id``, a parent replicate column, or ``P_<stress>`` is
        missing from ``df``.
    """
    control_s_column = "P_" + stress
    columns_control_ns = [
        "gene_id",
        "Parent_replicate1",
        "Parent_replicate2",
        "Parent_replicate3",
    ]
    columns_control_s = ["gene_id", control_s_column]

    def _resistant_columns(suffix):
        # Match the suffix rather than a substring: a plain `"_S" in c` test is
        # also true for the control column "P_SoCar", which would then be
        # returned as if it were a resistant-strain measurement.
        return ["gene_id"] + [
            c
            for c in df.columns
            if stress in c and c != control_s_column and c.endswith(suffix)
        ]

    return (
        df[columns_control_ns],
        df[columns_control_s],
        df[_resistant_columns("_NS")],
        df[_resistant_columns("_S")],
    )

First, we print our main DataFrame to take a look at the data.

In [2]:
# Plain-text view of the whole table; pandas abbreviates long and wide output with "...".
print(df_ge)

     gene_id  Parent_replicate1  Parent_replicate2  Parent_replicate3  \
0       aaeA           1.600048           1.580573           1.596062   
1       aaeB           2.384388           2.490061           2.490061   
2       aaeR           2.313178           2.259222           2.262170   
3       aaeX           1.675798           1.691446           1.548673   
4        aas           2.355012           2.326936           2.338066   
...      ...                ...                ...                ...   
4461    zraR           1.642945           1.712818           1.612395   
4462    zraS           1.351927           1.320579           1.411618   
4463    zupT           3.076438           3.090366           3.103091   
4464     zur           2.874605           2.888892           2.915271   
4465     zwf           2.411021           2.456952           2.431736   

        P_NaCl     P_KCl      P_Co   P_SoCar     P_Lac     P_Mal  ...  \
0     1.505947  1.571618  1.657503  1.794461  1.63

Let's use the `select_stress` function to get the relevant data for NaCl.

In [3]:
# select_stress returns a tuple of four DataFrames, each starting with gene_id.
nacl_data = select_stress(df_ge, "NaCl")
# Check the container type before pulling the frames apart.
print(type(nacl_data))

<class 'tuple'>


Our function returns a tuple of four DataFrames, so we have to "unpack" it into four separate variables.

In [4]:
# Order matches select_stress's return value: control _NS, control _S, resistant _NS, resistant _S.
# NOTE(review): the last two frames hold the NaCl selection, yet they are named df_mal_*;
# this looks like a leftover from a "Mal" example — confirm before renaming, later cells may use these names.
df_control_ns, df_control_s, df_mal_ns, df_mal_s = nacl_data

In [5]:
# gene_id plus the three Parent_replicate columns.
print(df_control_ns)

     gene_id  Parent_replicate1  Parent_replicate2  Parent_replicate3
0       aaeA           1.600048           1.580573           1.596062
1       aaeB           2.384388           2.490061           2.490061
2       aaeR           2.313178           2.259222           2.262170
3       aaeX           1.675798           1.691446           1.548673
4        aas           2.355012           2.326936           2.338066
...      ...                ...                ...                ...
4461    zraR           1.642945           1.712818           1.612395
4462    zraS           1.351927           1.320579           1.411618
4463    zupT           3.076438           3.090366           3.103091
4464     zur           2.874605           2.888892           2.915271
4465     zwf           2.411021           2.456952           2.431736

[4466 rows x 4 columns]


In [6]:
# gene_id plus the single P_NaCl column.
print(df_control_s)

     gene_id    P_NaCl
0       aaeA  1.505947
1       aaeB  2.712741
2       aaeR  2.486852
3       aaeX  1.616997
4        aas  2.187426
...      ...       ...
4461    zraR  1.703715
4462    zraS  2.355012
4463    zupT  3.146936
4464     zur  3.637942
4465     zwf  2.207367

[4466 rows x 2 columns]


We only want the numeric expression values, so we drop the `gene_id` column and convert what remains to a NumPy array.

In [7]:
# Discard the gene_id labels and keep the replicate values as a 2-D NumPy array.
control_ns = df_control_ns.drop(columns="gene_id").to_numpy()
print(control_ns)

[[1.60004796 1.58057272 1.59606189]
 [2.38438839 2.49006068 2.49006068]
 [2.31317839 2.25922236 2.26216996]
 ...
 [3.07643761 3.09036635 3.1030913 ]
 [2.87460481 2.88889245 2.91527066]
 [2.41102055 2.45695249 2.43173646]]


In [8]:
# Average the three parent replicates for each gene (one value per row).
# The mean is computed from df_control_ns rather than from control_ns itself, so
# re-running this cell does not fail with an axis error on an already 1-D array.
control_ns = df_control_ns.drop(columns=["gene_id"]).to_numpy().mean(axis=1)

In [9]:
# Discard the gene_id labels; what is left is the P_NaCl column as a NumPy array.
control_s = df_control_s.drop(columns="gene_id").to_numpy()
print(control_s)
# A single remaining column still yields a 2-D array, so inspect the shape.
print(control_s.shape)

[[1.50594672]
 [2.71274138]
 [2.48685151]
 ...
 [3.14693561]
 [3.63794232]
 [2.20736673]]
(4466, 1)


In [10]:
# Collapse the (n, 1) column array into a 1-D vector of length n. Subtracting a
# 1-D array from an (n, 1) array would broadcast to an (n, n) matrix instead of
# giving one difference per gene.
control_s = control_s.flatten()
print(control_s)
print(control_s.shape)

[1.50594672 2.71274138 2.48685151 ... 3.14693561 3.63794232 2.20736673]
(4466,)


In [11]:
# Per-gene difference: P_NaCl value minus the mean of the three parent replicates.
control_diff = control_s - control_ns
print(control_diff)

[-0.08628081  0.2579048   0.20866128 ...  0.05697053  0.74501968
 -0.22586977]


In [12]:
# Largest absolute difference across all genes.
np.abs(control_diff).max()

3.0941512849999997

In [13]:
# Smallest absolute difference across all genes.
np.abs(control_diff).min()

8.881784197001252e-16

In [14]:
# Population variance (NumPy's default, ddof=0) of the per-gene differences.
control_diff.var()

0.05929744729218926

In [15]:
# Row positions ordered from the smallest to the largest absolute difference.
sort_idxs = np.abs(control_diff).argsort()
print(control_diff[sort_idxs])
print(sort_idxs)

[ 8.88178420e-16  1.23297333e-04 -1.43241667e-04 ...  1.76809606e+00
  1.83709646e+00 -3.09415128e+00]
[ 329 2422 2228 ... 1893 3596 4286]


In [16]:
# Same ordering reversed, so the largest absolute difference comes first.
sort_idxs = np.abs(control_diff).argsort()[::-1]
print(control_diff[sort_idxs])
print(sort_idxs)

[-3.09415128e+00  1.83709646e+00  1.76809606e+00 ... -1.43241667e-04
  1.23297333e-04  8.88178420e-16]
[4286 3596 1893 ... 2228 2422  329]


In [17]:
# Reorder the full table by row position using sort_idxs (largest |control_diff| first
# when the cells are run in order).
df_ge.iloc[sort_idxs]

Unnamed: 0,gene_id,Parent_replicate1,Parent_replicate2,Parent_replicate3,P_NaCl,P_KCl,P_Co,P_SoCar,P_Lac,P_Mal,...,BuOH1_S,BuOH2_S,BuOH3_S,BuOH4_S,BuOH5_S,CPC1_S,CPC2_S,CPC3_S,CPC4_S,CPC5_S
4286,yodB,4.678731,4.670634,4.714923,1.593945,2.115288,1.518068,1.549497,1.825058,2.725969,...,4.427224,1.681780,4.210672,1.698021,1.789033,2.106388,4.495029,4.275344,3.400194,4.261676
3596,ygaX,2.688027,2.632626,2.605742,4.479228,4.232069,3.158823,3.783039,2.653236,1.957980,...,2.913075,2.910853,2.866889,2.991380,2.911913,2.531869,2.313878,2.798263,2.951115,2.948324
1893,proX,3.094475,3.049540,3.004708,4.817671,4.636924,3.553367,4.271946,3.041538,2.108719,...,3.335322,3.308732,3.316546,3.409435,3.290096,2.999753,2.724896,3.233769,3.411236,3.371803
4180,ymdF,2.359584,2.373460,2.402602,4.127130,3.676813,2.297827,2.924252,3.265844,4.327708,...,2.190760,2.188255,2.069192,2.019406,2.061853,2.832869,2.796733,2.510046,2.626998,2.582298
4459,znuC,4.630019,4.630019,4.695153,2.953263,3.179441,2.867457,2.716297,3.013608,3.689238,...,4.451010,2.876743,4.454109,2.894816,2.982333,3.334461,4.630019,4.591646,4.127130,4.695153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793,phnC,1.387270,1.318039,1.449391,1.385086,1.488599,1.409373,1.385086,1.447068,1.476008,...,1.423862,1.509091,1.356017,1.474472,1.538871,1.354199,1.365299,1.442657,1.306350,1.252333
3323,yedN_1,0.814256,0.797663,0.801634,0.804686,0.803897,0.784515,0.763734,0.754080,0.878126,...,0.775659,0.749113,0.753385,0.813190,0.773893,0.792075,0.752987,0.793821,0.771247,0.747404
2228,rygC,2.733641,2.686830,2.710021,2.710021,2.651550,2.782318,2.641222,2.632082,2.668627,...,2.633536,2.626998,2.653879,2.676098,2.695565,2.654396,2.608742,2.559854,2.956615,2.935965
2422,tfaS,0.768232,0.767123,0.746238,0.760655,0.763734,0.763215,0.783060,0.773893,0.773471,...,0.755846,0.734381,0.737486,0.759389,0.746887,0.771247,0.768756,0.748633,0.784992,0.757972


In [18]:
# Rank genes by the ratio control_s / control_ns instead of the difference,
# highest ratio first.
sort_idxs_ratio = (control_s / control_ns).argsort()[::-1]
print(sort_idxs_ratio)

[1281 2424 2426 ... 4459 2276 4286]


In [19]:
# Full table reordered by row position, from the largest to the smallest control_s / control_ns ratio.
df_ge.iloc[sort_idxs_ratio]

Unnamed: 0,gene_id,Parent_replicate1,Parent_replicate2,Parent_replicate3,P_NaCl,P_KCl,P_Co,P_SoCar,P_Lac,P_Mal,...,BuOH1_S,BuOH2_S,BuOH3_S,BuOH4_S,BuOH5_S,CPC1_S,CPC2_S,CPC3_S,CPC4_S,CPC5_S
1281,kdpF,0.957432,1.051754,0.799222,2.456309,0.994012,1.776002,0.861017,1.259539,1.589559,...,0.806693,0.761083,0.775282,0.928758,0.782602,1.203230,0.794336,0.812139,1.168332,1.187458
2424,thiC,1.374374,1.461510,1.435003,2.790718,2.140124,1.647559,1.759285,1.525931,1.281743,...,2.352245,2.572021,2.406101,2.436446,2.712741,1.398773,1.338533,1.528405,1.356744,1.327078
2426,thiE,1.644415,1.617800,1.595423,3.077589,2.422097,1.877849,2.025810,1.752811,1.675798,...,2.594852,2.827982,2.634024,2.696574,2.998708,1.584882,1.527510,1.779675,1.526814,1.547157
2456,tnaB,1.164081,1.267826,1.374374,2.373460,1.477557,1.204007,1.059174,1.662314,1.020382,...,1.342623,1.381784,1.351107,1.804496,1.389750,1.112660,1.193639,1.293827,1.440254,1.476008
3597,ygaY,1.704639,1.681039,1.582408,3.063054,2.968620,2.037092,2.453727,2.152173,2.237927,...,1.959806,1.970070,1.885148,1.973783,1.930201,1.763589,1.618635,1.672586,1.813288,1.791640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2634,xylB,1.950199,1.970070,2.024819,1.296491,1.328608,1.655673,1.278955,2.822272,2.414993,...,1.421376,1.659015,1.696132,1.630339,1.711218,2.895786,2.481158,2.087654,1.878748,1.927464
1773,pfkB,2.649123,2.609813,2.665436,1.693009,1.869521,2.666011,2.039641,2.335915,2.233470,...,2.499898,2.457583,2.518857,2.403200,2.499228,2.395470,2.550223,2.532993,2.803962,2.824413
4459,znuC,4.630019,4.630019,4.695153,2.953263,3.179441,2.867457,2.716297,3.013608,3.689238,...,4.451010,2.876743,4.454109,2.894816,2.982333,3.334461,4.630019,4.591646,4.127130,4.695153
2276,serU,3.022674,2.939124,3.130691,1.558518,1.646716,1.690550,1.250627,1.581476,1.786974,...,2.841486,1.429495,2.652080,1.624323,1.693009,1.691446,2.892283,2.782318,2.313878,2.940866
