In [1]:
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression

In [2]:
# Basic stats about the size of files
# They all have 69742 rows
# They all have the same number of columns per row as the number of columns in the header
# Numbers of columns are 2142 for train_cat.csv, 1158 for train_date.csv, 970 for train_numeric.csv.

def get_basic_stats(filename):
    count = 0
    with open('bosch_small_data/{}'.format(filename)) as fp:
        line = fp.readline()
        num_columns = len(line.split(','))
        row_lengths = []
        while line:
            line = fp.readline()
            if line:
                count = count+1
                row_length = len(line.split(','))
                if (row_length != num_columns):
                    print(count)
                row_lengths.append(row_length)
    return [count, num_columns, np.unique(row_lengths)]

In [3]:
# Other stuff that has been verified

# All parts are ordered by ID in each training data set, but not in test.

In [4]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [83]:
# Get a count of station tranfers throughout the data set.
# Following, assume the order of stations for a given part is based on the order of the marking in the date field.
# When there are different times for measurements at a station, the actual time is assumed to be the first one.

# An object indicating the number of parts that go from station X to Y for each X and Y.
station_map = {}
# An object of the form {LxSy: {LzSw:[1,2,3,...]}}, indicating parts for which one station is succssor of another.
successors = {}
# List of all stations by part ID
station_lists = {}

with open('bosch_small_data/train_date.csv') as fp:
    line = fp.readline() # The line as a single text string
    header = line.split(',') # Array of each of the columns
    line_count = 0 # Count the total number of lines read, not including the header line.
    while(line): # while(line) to read through them all
        line = fp.readline().strip() # Single text string
        dates = line.split(',') # Get all values. Most are dates, except first is the ID and last is response.
        line_count += 1
        d = {} # An object containing all stations that the part visits and the (earliest) time of the visit.
        #if (line_count % 1000 == 0):
        #    print(line_count)
        if line:
            for i in range(1,len(header)-1):
                blocks = header[i].split('_')
                header_stem = blocks[0]+blocks[1] # Of the form L[X]S[Y], which is now stations will be identified.
                # Make sure we get the minimum valid date.
                if is_number(dates[i]):
                    if header_stem not in d:
                        d[header_stem] = float(dates[i])
                    else:
                        d[header_stem] = min(float(dates[i]), d[header_stem])
            stations = list(d.keys()) # The same values as header_stem, as above.
            sorted(stations, key=lambda l:d[l]) # Sort by date. In case of tie, keep initial order.
            
            # Convert stations list to the map for the Sankey diagram
            if (len(stations) > 0):
                # Start stations
                if ("Start "+stations[0]) not in station_map:
                    station_map["Start "+stations[0]] = 1
                else:
                    station_map["Start "+stations[0]] += 1
                if "Start" not in successors:
                    successors["Start"] = {}
                if stations[0] not in successors["Start"]:
                    successors["Start"][stations[0]] = []
                successors["Start"][stations[0]].append(dates[0]) # Append ID
                    
                # Intermediate stations
                for i in range(len(stations)-1):
                    if (stations[i]+" "+stations[i+1]) not in station_map:
                        station_map[stations[i]+" "+stations[i+1]] = 1
                    else:
                        station_map[stations[i]+" "+stations[i+1]] += 1
                    if stations[i] not in successors:
                        successors[stations[i]] = {}
                    if stations[i+1] not in successors[stations[i]]:
                        successors[stations[i]][stations[i+1]] = []
                    successors[stations[i]][stations[i+1]].append(dates[0]) # Append ID
                    
                # Final station, which 'Good' or 'Bad' endpoint based on response.
                end_mapper = {"0.0":"Good", "1.0":"Bad"}[dates[-1]]
                if (stations[-1]+" "+end_mapper) not in station_map:
                    station_map[stations[-1]+" "+end_mapper] = 1
                else:
                    station_map[stations[-1]+" "+end_mapper] += 1
                if stations[-1] not in successors:
                    successors[stations[-1]] = {}
                if end_mapper not in successors[stations[-1]]:
                    successors[stations[-1]][end_mapper] = []
                successors[stations[-1]][end_mapper].append(dates[0]) # Append ID
                
                # Store the full list of stations
                station_lists[dates[0]] = ["Start"]+stations+[end_mapper]

                    
filehandler = open("station_map.obj", 'wb')
pickle.dump(station_map, filehandler)
filehandler.close()

filehandler = open("successors.obj", 'wb')
pickle.dump(successors, filehandler)
filehandler.close()

filehandler = open("station_lists.obj", 'wb')
pickle.dump(station_lists, filehandler)
filehandler.close()





79926
162193
289593
339074
386604
387364
430156
475302
578159
598386
602480
605358
632286
635586


KeyboardInterrupt: 

In [5]:
# Uncomment and run this to load the station map from file rather than recreate it
with open("station_map.obj", "rb") as input_file:
    station_map = pickle.load(input_file)
with open("successors.obj", "rb") as input_file:
    successors = pickle.load(input_file)
with open("station_lists.obj", "rb") as input_file:
    station_lists = pickle.load(input_file)
# From the list of successors, just get the number of successors from one station to another.
# Same data as in station_map, but organized more explicity by starting station.
successor_lengths = {}
for key in successors:
    successor_lengths[key] = {}
    for key2 in successors[key]:
        successor_lengths[key][key2] = len(successors[key][key2])

In [6]:
# Convert station map to a destination dictionary for each station
transfer_dict = {}
for key in station_map:
    [start, end] = key.split(" ")
    if start not in transfer_dict:
        transfer_dict[start] = {}
    transfer_dict[start][end] = station_map[key]

In [7]:
successor_lengths["L0S14"]

{'L0S17': 3656, 'L0S16': 3511, 'L0S15': 2}

In [8]:
def get_training_data(station):
    station_blocks = station.split('S')
    station_alt = station_blocks[0]+"_S"+station_blocks[1]
    with open('bosch_small_data/train_date.csv') as fp_date:
        with open('bosch_small_data/train_cat.csv') as fp_cat:
            with open('bosch_small_data/train_numeric.csv') as fp_numeric:

                # Get the column numbers of interest
                date_headers = fp_date.readline().strip().split(',')
                cat_headers = fp_cat.readline().strip().split(',')
                numeric_headers = fp_numeric.readline().strip().split(',')
                date_columns = []
                cat_columns = []
                numeric_columns = []
                for i in range(len(date_headers)):
                    if station_alt in date_headers[i]:
                        date_columns.append(i)
                for i in range(len(cat_headers)):
                    if station_alt in cat_headers[i]:
                        cat_columns.append(i)
                for i in range(len(numeric_headers)):
                    if station_alt in numeric_headers[i]:
                        numeric_columns.append(i)

                # Set up data object
                # station_cat, station_numeric, and station_response will be the columns for the given station.
                station_cat = {}
                station_numeric = {}
                station_response = []
                for i in range(len(cat_columns)):
                    station_cat[cat_headers[cat_columns[i]]] = []
                for i in range(len(numeric_columns)):
                    station_numeric[numeric_headers[numeric_columns[i]]] = []

                # Read line by line and fill in data
                line_date = 'asdf'
                line_cat = ""
                line_numeric = ""
                while line_date:
                    line_date = fp_date.readline()
                    line_cat = fp_cat.readline()
                    line_numeric = fp_numeric.readline()
                    if line_date:
                        line_date_ = line_date.strip().split(',')
                        line_cat_ = line_cat.strip().split(',')
                        line_numeric_ = line_numeric.strip().split(',')
                        id_ = line_date_[0]
                        if id_ in station_lists and station in station_lists[id_]:
                            for i in range(len(cat_columns)):
                                station_cat[cat_headers[cat_columns[i]]].append(line_cat_[cat_columns[i]])
                            for i in range(len(numeric_columns)):
                                val = line_numeric_[numeric_columns[i]]
                                if not is_number(val):
                                    val = '0'
                                station_numeric[numeric_headers[numeric_columns[i]]].append(val)
                            station_response.append(station_lists[id_][station_lists[id_].index(station)+1])
                return {"cat":station_cat, "numeric":station_numeric, "response":station_response}
#training_data = get_training_data("L0S14")

In [9]:
# Next thing to do
# Use station_cat, station_numeric to predict station_response (maybe just station_numeric)

# Response in a form appropriate for scikit-learn.
def convert(x,s):
    if x == s:
        return 1
    return 0
def get_scores(station, training_data):
    successors_ = np.unique(training_data["response"])
    X = [training_data["numeric"][list(training_data["numeric"].keys())[i]] for i in range(len(training_data["numeric"].keys()))]
    X = np.array(X).transpose().astype(float)
    scores = {}
    for i in range(len(successors_)):
        if successor_lengths[station][successors_[i]] > 100:
            y = np.array(list(map(lambda x:convert(x,successors_[i]), training_data["response"])))
            reg = LinearRegression().fit(X, y)
            scores[successors_[i]] = reg.score(X,y)
    return scores


In [112]:
# Now do it for all stations
overall_results = {}
# Some bad stations have successors with missing numeric data. For now, just skip them
#bad_stations = ["Start","L0S21","L3S30","L0S1"]
bad_stations=["Start"]
for station in successor_lengths:
    if station not in bad_stations:
        num = 0 # Number of successors with over 100 parts. Need at least two of them
        for key in successor_lengths[station]:
            if successor_lengths[station][key] > 100:
                num += 1
        if num>1:
            print(station)
            training_data = get_training_data(station)
            if len(training_data["numeric"]) > 0:
                overall_results[station] = get_scores(station, training_data)

L0S13
L0S14
L0S17
L0S20
L0S21
L3S30
L3S34
L3S37
L0S1
L0S3
L0S4
L0S8
L0S11
L0S22
L0S5
L0S9
L0S16
L0S23
L2S27
L0S15
L1S24
L0S2
L0S10
L2S26
L3S41
L3S48
L3S51
L3S32
L1S25
L3S38


In [114]:
#overall_results # Scores, which are R^2 values.
# L3S30, L1S24, L1S25 are, on a brief scan, the only ones with any successors with an R^2 > 0.1

In [117]:
successor_lengths["L1S25"]

{'L2S26': 2980,
 'L3S38': 3,
 'L2S27': 1346,
 'L3S29': 474,
 'L2S28': 117,
 'L3S30': 3,
 'L3S39': 49}

In [None]:
# Presentation
# 1) Introduce the data set
# 2) Show the flow diagram of parts. Consider https://www.kaggle.com/gingerman/shopfloor-visualization
# 3) General question, do measurements influence the path that a part takes through the line?
# 4) Hypothesis: a linear model can predict the path
# 5) Describe experimental setup.
# 6) Result: no evidence