In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
df = pd.read_csv("RawDataRound2_Sheet1.csv")
df.head()

Unnamed: 0,Mutation,Name,cucurbitadienol,L_1,L_12,L_23
0,WT,890,0,14152832,3048328,9230356
1,CSA_F113L_E286N_V351A_T352A_LC,890-T1,8015006,1677381,124711,0
2,CSA_F112L_E286A_I459T_LC,890-T2,6705038,14121996,1667221,0
3,CSA_F113L_L211F_E286A_V351A_T352A_LC,890-T3,6618913,245783,0,0
4,CSA_F112V_G114H_E286L_V351A_T352A_LC,890-T4,6953896,632471,0,0


In [3]:
timepoints = [1,12,23]

y = df[['L_{}'.format(t) for t in timepoints]]

print(type(y))

In [4]:
def regression_apply(df, timepoints, name="L", weighted=False):
    """
    :py:meth:`pandas.DataFrame.apply` apply function for calculating
    enrichment using linear regression. If *weighted* is ``True`` perform
    weighted least squares; else perform ordinary least squares.

    Weights for weighted least squares are included in *row*.

    Returns a :py:class:`pandas.Series` containing regression coefficients,
    residuals, and statistics.
    """
    # retrieve log ratios from the row
    y = df[[name+'_{}'.format(t) for t in timepoints]]
    print(y.shape,type(y),np.asarray(y),y)
    #y = [14152832 , 3048328 , 9230356]
    # re-scale the x's to fall within [0, 1]
    xvalues = [x / float(max(timepoints)) for x in timepoints]

    # perform the fit
    X = sm.add_constant(xvalues)  # fit intercept
    
    if weighted:
        W = df[['W_{}'.format(t) for t in timepoints]]
        fit = sm.WLS(y, X, weights=W).fit()
    else:
        model = sm.OLS(y.astype(float), X)
        results = model.fit()
        
    # re-format as a data frame row
    values = np.concatenate([results.params, [results.bse[0], results.tvalues[0],results.pvalues[0]], results.resid])
    index = ['intercept', 'slope', 'SE_slope', 't', 'pvalue_raw'] + ['e_{}'.format(t) for t in timepoints]
    return pd.Series(data=values, index=index)


In [5]:
newtest = df.apply(regression_apply, args=[timepoints, 'L',False], axis='columns')

(3,) <class 'pandas.core.series.Series'> [14152832 3048328 9230356] L_1     14152832
L_12     3048328
L_23     9230356
Name: 0, dtype: object
(3,) <class 'pandas.core.series.Series'> [14152832 3048328 9230356] L_1     14152832
L_12     3048328
L_23     9230356
Name: 0, dtype: object
(3,) <class 'pandas.core.series.Series'> [1677381 124711 0] L_1     1677381
L_12     124711
L_23          0
Name: 1, dtype: object
(3,) <class 'pandas.core.series.Series'> [14121996 1667221 0] L_1     14121996
L_12     1667221
L_23           0
Name: 2, dtype: object
(3,) <class 'pandas.core.series.Series'> [245783 0 0] L_1     245783
L_12         0
L_23         0
Name: 3, dtype: object
(3,) <class 'pandas.core.series.Series'> [632471 0 0] L_1     632471
L_12         0
L_23         0
Name: 4, dtype: object
(3,) <class 'pandas.core.series.Series'> [726140 0 0] L_1     726140
L_12         0
L_23         0
Name: 5, dtype: object
(3,) <class 'pandas.core.series.Series'> [3480176 0 0] L_1     3480176
L_12        

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


#for i in df:
#    regression_apply(i, timepoints,name='L')
newtest = regression_apply(df, timepoints,name='L')

print(newtest)

In [6]:
nsample = 100
x = np.linspace(0, 10, 100)
X = np.column_stack((x, x**2))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)
X = sm.add_constant(X)

In [7]:
y = np.dot(X, beta) + e
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 4.833e+06
Date:                Thu, 25 Apr 2019   Prob (F-statistic):          3.73e-243
Time:                        09:58:25   Log-Likelihood:                -137.32
No. Observations:                 100   AIC:                             280.6
Df Residuals:                      97   BIC:                             288.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8290      0.285      2.906      0.0