In [None]:
import rdata
import pandas as pd
import glob
import re
import dask.dataframe as dd

In [None]:
#/**
# * @Parameter RDATA_PATH(path) : Rdata file to read
# * @Parameter DATA_NAME(str) : key of the object to pull out of the converted Rdata content
# * Parses the Rdata file, converts it to Python objects and wraps the entry
# * stored under DATA_NAME in a pandas DataFrame.
# * @return Rdata_df(pandas.DataFrame)
# */
def Convert_Rdata_to_DataFrame(RDATA_PATH , DATA_NAME):
    # Parse and convert in one go; the converted result is indexed by DATA_NAME
    Converted_Objects = rdata.conversion.convert(rdata.parser.parse_file(RDATA_PATH))

    return pd.DataFrame(Converted_Objects[DATA_NAME])


#/**
# * @Parameter DF(pandas.DataFrame) : column names are expected to be strings
# * Renames every column with str.capitalize(): the first character is
# * upper-cased and the remaining characters are lower-cased.
# * @return DataFrame carrying the renamed columns
# */
def Fix_columnName(DF):
    Capitalized_Names = [Name.capitalize() for Name in DF.columns]

    return DF.set_axis(Capitalized_Names, axis='columns')


#* Change data type of at(attribute) / wt (weather) data
#/**
# * @Parameter AT_DF(pandas.DataFrame) : attribute table, expected to contain the columns listed in the mapping below
# * Casts the time-stamp and attribute columns to compact integer types.
# * Columns that are not listed keep their dtype.
# * @return re-typed DataFrame
# */
def Convert_atdf_Dtypes(AT_DF):
    Target_Dtypes = dict(
        Time="int64",
        Year="int16",
        Month="int8",
        Day="int8",
        Hour="int8",
        Min="int8",
        Age="int16",
        N8="int8",
    )

    return AT_DF.astype(Target_Dtypes)


#/**
# * @Parameter WT_DF(pandas.DataFrame) : weather table, expected to contain the columns listed in the mapping below
# * Casts the time-stamp columns to compact integer types and the two
# * weather readings (Temperature, Radiation) to float32.
# * Columns that are not listed keep their dtype.
# * @return re-typed DataFrame
# */
def Convert_wtdf_Dtypes(WT_DF):
    Integer_Dtypes = dict(Time="int64", No="int16", Year="int16", Month="int8",
                          Day="int8", Hour="int8", Min="int8")
    Reading_Dtypes = dict(Temperature="float32", Radiation="float32")

    return WT_DF.astype({**Integer_Dtypes, **Reading_Dtypes})

#/**
# * @Parameter GENE_LIST(list) : names such as "gene name.radiation" or "gene name.temperature"
# * Builds two lists in a single pass:
# *   - the names with every ".radiation" / ".temperature" removed
# *   - the label found in each name ("radiation" is checked before "temperature")
# * A name containing neither label adds nothing to the label list, so the
# * two lists can differ in length.
# * @return Plane_Gene_List(list) , Temp_Rad_list(list)
# */
def Change_GeneList_to_PlaneGeneList(GENE_LIST):
    Plane_Gene_List = []
    Temp_Rad_list = []

    for Labeled_Name in GENE_LIST:
        # Record only the first label that occurs anywhere in the name
        for Label in ("radiation", "temperature"):
            if Label in Labeled_Name:
                Temp_Rad_list.append(Label)
                break

        Plane_Gene_List.append(
            Labeled_Name.replace('.radiation', '').replace('.temperature', '')
        )

    return Plane_Gene_List , Temp_Rad_list

#/**
# * @Parameter EXPRESSION_MATRIX_PATH(path) : Rdata file holding the "prediction.results" matrix
# *            (108 rows * 500 columns according to the original note)
# * @Parameter WEATHER_DF(pandas.DataFrame) : its "Time" column is inserted as-is, so it needs one row per matrix row
# * @Parameter ALTC_ID : value written to the "ALTC" column
# *            (ALTC = Artifical light and temperature conditon, the simulated condition name)
# * @Parameter ENV_COUNT : value written to the "EnvCount" column (sample size of the simulation sampling condition)
# * @Parameter DF_CORRECT_FIT(pandas.DataFrame) : reference FIT table
# * Reads the expression matrix, unpivots it to one row per (Time, gene),
# * strips the ".radiation" / ".temperature" label from the gene names, then
# * inner-joins the weather table and the reference FIT table.
# * @return df_all(pandas.DataFrame)
# */
def ExpressionMatrix_to_Dataframe(EXPRESSION_MATRIX_PATH , WEATHER_DF , ALTC_ID , ENV_COUNT, DF_CORRECT_FIT):
    Reference_Merge_Keys = ['Time','Gene', 'Year', 'Month','Day', 'Hour', 'Min','Age', 'N8', 'Temperature', 'Radiation']

    # Wide matrix: one column per labelled gene, values forced to float64
    Wide_Matrix = Convert_Rdata_to_DataFrame(EXPRESSION_MATRIX_PATH ,"prediction.results").astype('float64')

    # Prepend the time stamps, then unpivot to long format
    Wide_Matrix.insert(0, 'Time', list(WEATHER_DF["Time"]))
    Long_Table = Wide_Matrix.melt(id_vars='Time', var_name="Gene_withLabel", value_name="FIT_Test_Value")

    # Constant columns identifying the simulated condition
    Long_Table["ALTC"] = ALTC_ID
    Long_Table["EnvCount"] = ENV_COUNT

    # Keep only the bare gene name; the label list is not needed here
    Plane_Names, _ = Change_GeneList_to_PlaneGeneList(list(Long_Table["Gene_withLabel"]))
    Long_Table["Gene"] = Plane_Names
    Long_Table = Long_Table.drop(columns='Gene_withLabel')

    # Attach the weather conditions, then the reference FIT values
    With_Weather = Long_Table.merge(WEATHER_DF, on="Time", how="inner")

    return With_Weather.merge(DF_CORRECT_FIT, on=Reference_Merge_Keys, how="inner")

#/**
# * @Parameter DATAFRAME(pandas.DataFrame) : first three columns are read by position as ["Date" , "Sunrise" , "Sunset"]
# * Turns each input row into two rows (sunrise -> "DayStart", sunset -> "DayFinish")
# * and splits date and time into numeric parts.
# * Output columns: ['Date', 'Time', 'DayOrNight', 'Year', 'Month', 'Day', 'Hour', 'Min']
# * Exact duplicate rows are dropped.
# * @return df_DayNight(pandas.DataFrame)
# */
def ChangeDataStarcture_DayNight(DATAFRAME):
    Long_Records = []
    for Source_Row in DATAFRAME.values.tolist():
        # Sunrise row first, sunset row second, both carrying the same date
        Long_Records.extend((
            [Source_Row[0], Source_Row[1], "DayStart"],
            [Source_Row[0], Source_Row[2], "DayFinish"],
        ))

    df_DayNight = pd.DataFrame(Long_Records, columns=["Date", "Time", 'DayOrNight'])
    df_DayNight["Date"] = pd.to_datetime(df_DayNight["Date"])

    # Parse each column once and read the individual fields from the accessors
    Date_Fields = df_DayNight["Date"].dt
    Clock_Fields = pd.to_datetime(df_DayNight["Time"]).dt

    df_DayNight["Year"] = Date_Fields.year
    df_DayNight["Month"] = Date_Fields.month
    df_DayNight["Day"] = Date_Fields.day
    df_DayNight["Hour"] = Clock_Fields.hour
    df_DayNight["Min"] = Clock_Fields.minute

    return df_DayNight.drop_duplicates()


# Returns (hour, minute) of the single row labelled LABEL within one day's rows.
# Raises ValueError unless exactly one such row exists.
def _Get_Single_HourMin(DAY_ROWS, LABEL, DATE_KEY):
    Matched = DAY_ROWS[DAY_ROWS["DayOrNight"] == LABEL]

    if len(Matched) != 1:
        raise ValueError(
            f"Expected exactly one '{LABEL}' row for (Year, Month, Day) = {DATE_KEY}, "
            f"found {len(Matched)}"
        )

    # .iloc[0] extracts the scalar explicitly; calling int() on a whole
    # one-row Series is deprecated in recent pandas.
    return int(Matched["Hour"].iloc[0]), int(Matched["Min"].iloc[0])


#/**
# * @Parameter DN_DF(pandas.DataFrame) : needs the columns "Year", "Month", "Day", "Hour", "Min", "DayOrNight";
# *            every day must have exactly one "DayStart" row and one "DayFinish" row
# * Returns a two-dimensional list summarizing sunrise and sunset times by day,
# * sorted by (Year, Month, Day). All values are plain Python ints.
# * DH/DM : hour/minute of sunrise , NH/NM : hour/minute of sunset
# * @return 2dlis [[Year, Month, Day, DH, DM, NH, NM], ...]  e.g. [[2015,5,1,5,8,18,42], ...]
# * @raises ValueError when a day has no (or more than one) sunrise or sunset row
# */
def Make_DayStar_Finish_2Dlis(DN_DF):
    Day_Night_2Dlis = []

    # One pass over the table instead of re-filtering it for every year/month/day
    for Date_Key, Day_Rows in DN_DF.groupby(["Year", "Month", "Day"], sort=True):
        Year, Month, Day = (int(Part) for Part in Date_Key)

        DH, DM = _Get_Single_HourMin(Day_Rows, "DayStart", (Year, Month, Day))
        NH, NM = _Get_Single_HourMin(Day_Rows, "DayFinish", (Year, Month, Day))

        Day_Night_2Dlis.append([Year, Month, Day, DH, DM, NH, NM])

    return Day_Night_2Dlis


#/**
# * @Parameter x : row-like object exposing .Hour and .Min
# * @Parameter DH , DM : hour and minute of sunrise
# * @Parameter NH , NM : hour and minute of sunset
# * Labels a time stamp as "Day" when sunrise <= time < sunset, otherwise "Night".
# * Example with sunrise 5:08 and sunset 16:42 (DH = 5 , DM = 8 , NH = 16 , NM = 42):
# *   5:07 -> "Night" , 5:08 -> "Day" , 16:41 -> "Day" , 16:42 -> "Night"
# * @return "Day" or "Night"
# */
def Assign_DayorNight(x,DH,DM,NH,NM):

    # Compare (hour, minute) pairs so both boundaries are handled by one
    # half-open interval test, including the case where sunrise and sunset
    # fall within the same hour. Tuples are used instead of Hour * 60 + Min
    # so that narrow integer types cannot overflow.
    Clock = (x.Hour, x.Min)

    if (DH, DM) <= Clock < (NH, NM):
        return "Day"

    return "Night"

In [None]:
# Time series files of 108 evaluation weather data points.
# Rdata file; the cell below reads the object named "at" (attribute data) from it.
EvaluationWeatherData_Path = r"data\EvaluationWeatherData"

# File of evaluation weather data for the entire period.
# Rdata file; the cell below reads the object named "weather" from it.
# (the file name suggests a 1-minute interval - TODO confirm)
All_EvaluationWeatherData_Path = r"data\All_EvaluationWeatherData_1minInterval"

# 108 points of gene expression output when the evaluation weather data is input into the reference FIT model.
# NOTE(review): this path is not read in any of the visible cells - confirm where the reference FIT table is loaded.
Reference_FIT_GeneExpression_Path = r"ReferenceFIT_Geneexpression"

In [None]:
# Root folder of the simulation sampling condition results; it is searched recursively for "TestFIT*" files.
SimulationSamplingConditon_Path = r"SimulationSamplingConditon"

# Path of the CSV with Tsukuba's day length info, columns ["Date","Sunrise","Sunset"]
df_Tsukuba_Weather_DayNight_Path = r"Tsukuba_Weather_DayNight.csv"

In [None]:
# Attribute data (at): read from Rdata -> capitalize the column names -> compact the dtypes
at_Varidation_df = Convert_atdf_Dtypes(
    Fix_columnName(Convert_Rdata_to_DataFrame(EvaluationWeatherData_Path ,"at"))
)

# Weather data (wt): same pipeline, then keep only the time stamp and the two weather readings
wt_df = Convert_wtdf_Dtypes(
    Fix_columnName(Convert_Rdata_to_DataFrame(All_EvaluationWeatherData_Path , "weather"))
)
wt_df = wt_df[['Time','Year', 'Month', 'Day', 'Hour', 'Min','Temperature','Radiation']]

# Inner join: keep the time stamps that exist in both the at data and the wt data
Weather_df = at_Varidation_df.merge(
    wt_df,
    on=["Time","Year","Month","Day","Hour","Min"],
    how="inner",
)

In [None]:
# Read TestFIT_Expression in SimulationSamplingConditon, unpivot and assign at data, and merge with reference_fit.
# NOTE(review): df_Reference_FIT is not defined in any of the visible cells
# (Reference_FIT_GeneExpression_Path is declared but never read) - confirm it
# is created before this cell runs.
SimSampCon_TestFIT_Path_List = glob.glob(SimulationSamplingConditon_Path + "//**//TestFIT*",recursive=True)

if not SimSampCon_TestFIT_Path_List:
    raise FileNotFoundError(
        f"No 'TestFIT*' file found under '{SimulationSamplingConditon_Path}'"
    )

# Group 1: full condition id, e.g. "ALTC_12Day_001".
# Group 2: the part between the underscores, e.g. "12Day" (stored as EnvCount, as a string).
_ALTC_Pattern = re.compile(r"(ALTC_(\d{1,2}(?:Day|Night)?)_\d{3})")

SimSampCon_Result_List = []
for FilePath in SimSampCon_TestFIT_Path_List:

    Analysis_Condition = _ALTC_Pattern.findall(FilePath)
    if not Analysis_Condition:
        raise ValueError(f"Cannot read the ALTC condition from the path '{FilePath}'")

    # Only the first match in the path is used
    ALTC, EnvCount = Analysis_Condition[0]

    df_all = ExpressionMatrix_to_Dataframe(FilePath , Weather_df, ALTC , EnvCount , df_Reference_FIT)

    SimSampCon_Result_List.append(df_all)

# Merge the data
df_all_raw = pd.concat(SimSampCon_Result_List)

# Create "Date" column (built from Year/Month/Day/Hour only, so it is truncated to the hour).
# It is created before the copy so that both tables share it without parsing twice.
df_all_raw["Date"] = pd.to_datetime(df_all_raw[["Year", "Month", "Day", "Hour"]])

df_all_filter = df_all_raw.copy()

# Replacement of outliers in gene expression levels: values are limited to the range [0, 20].
# clip() is the vectorized equivalent of the per-element rule "20 if x >= 20, 0 if x < 0";
# NaN stays NaN.
df_all_filter['FIT_Test_Value'] = df_all_filter['FIT_Test_Value'].clip(lower=0, upper=20)

Assign a Day/Night label to each time point

In [None]:
# Importing sunset/sunrise data from weather data (Tsukuba)
df_Tsukuba_Weather_DayNight = dd.read_csv(df_Tsukuba_Weather_DayNight_Path,assume_missing=True).compute()

# Converts data structure of day length data to ["Date", "Time", "DayorNight"].
df_DN = ChangeDataStarcture_DayNight(df_Tsukuba_Weather_DayNight)

# To reduce calculation time, create a df with only the distinct time stamps of this data.
# De-duplication is done on the five numeric columns themselves: the
# concatenated "Date" string has no separators, so different time stamps can
# produce the same text (2015/1/11 1:00 and 2015/11/1 1:00 both give
# "201511110") and must not be used as the uniqueness key.
# .copy() makes df_FIT_Date an independent frame before a column is added.
_Time_Stamp_Columns = ['Year', 'Month', 'Day', 'Hour', 'Min']
df_FIT_Date = df_all_filter[_Time_Stamp_Columns].drop_duplicates().copy()

# The "Date" text column keeps its original format
df_FIT_Date["Date"] = df_FIT_Date['Year'].astype(str) + df_FIT_Date['Month'].astype(str)+ df_FIT_Date['Day'].astype(str) + df_FIT_Date['Hour'].astype(str)+ df_FIT_Date['Min'].astype(str)

In [None]:
# Data frames into a two-dimensional array [Year,M,D,DH,DM,NH,ND].
Day_Night_2Dlis = Make_DayStar_Finish_2Dlis(df_DN)

df_all_list = []
for SingleList in Day_Night_2Dlis:

    # DH/DM : hour/minute of sunrise , NH/ND : hour/minute of sunset
    Year, Month, Day, DH, DM, NH, ND = SingleList

    # .copy() gives an independent frame, so the columns added below are not
    # written onto a slice of df_FIT_Date (avoids SettingWithCopyWarning).
    df_select = df_FIT_Date[(df_FIT_Date['Year'] == Year) & (df_FIT_Date['Month'] == Month) & (df_FIT_Date['Day'] == Day)].copy()

    # Days without any FIT time stamp are skipped
    if df_select.empty:
        continue

    df_select['DN'] = df_select.apply(lambda x:Assign_DayorNight(x,DH,DM,NH,ND),axis=1)

    # NOTE(review): this counts the rows of this day labelled "Day". It equals
    # the day length in hours only if df_FIT_Date holds exactly one row per
    # hour of the day - confirm against the sampling of the input data.
    DayCount = int((df_select['DN'] == 'Day').sum())
    df_select['DayLength(Hour)'] = DayCount

    df_all_list.append(df_select)

if not df_all_list:
    raise ValueError("No date of the sunrise/sunset table matches the dates in df_FIT_Date")

df_all_wt = pd.concat(df_all_list,axis=0)
df_all_wt = df_all_wt[['Year', 'Month', 'Day', 'Hour', 'Min','DN', 'DayLength(Hour)']]

Output the results

In [None]:
# Attach the Day/Night label and the per-day "Day" count to both tables.
# Inner join: rows whose time stamp is missing from df_all_wt are dropped.
_Time_Stamp_Keys = ['Year', 'Month', 'Day', 'Hour', 'Min']
df_all_raw = df_all_raw.merge(df_all_wt, how="inner", on=_Time_Stamp_Keys)
df_all_filter = df_all_filter.merge(df_all_wt, how="inner", on=_Time_Stamp_Keys)

In [None]:
# Write the unfiltered table first, then the outlier-replaced table, both without the index column
_Output_Tables = {
    "df_all_fit_raw.csv": df_all_raw,
    "df_all_fit_filterd.csv": df_all_filter,
}
for _Csv_Name, _Table in _Output_Tables.items():
    _Table.to_csv(_Csv_Name, index=False)