diff --git a/README.md b/README.md index 4194953..28136a0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Flight-Delay-Prediction -This repo is 2018 Fall EE608 Final Project: Flight Delay Prediction +This repo is the 2018 Fall EE608 Final Project: The Prediction of Flight Delays Using Regression Methods ### Team Member: @@ -9,8 +9,7 @@ Ziran Gong zgong5@stevens.edu Yuqing Luo yluo27@stevens.edu Bowen Li bli50@steve ### Dataset Due to the size, I upload all the dataset to google drive. -In this project, we use part of data which is the first month. -We use the first 3 week to predick the last week in this month. +In this project, we use the first month of data: the first 3 weeks are used to predict the last week. https://drive.google.com/drive/folders/1LDDwiQW-74P5NFDTEpCAfETxSIBNu7cC?usp=sharing @@ -74,7 +73,7 @@ After run it to the final step, you can get the flight delay prediction result f **Airlines Rank & Recommandation** ``` -airlines-rank.ipynb +rank.ipynb ``` Use `Shift + Enter` to run code step by step. Then, you can get the results in the data processing process. @@ -83,13 +82,9 @@ Finally, you can get the rank histogram and the recommandation. ### Conclusion * Data analysis algorithms are applied to predict flight delay. - * Airlines are ranked for recommendation purpose. - * In model 1, cross-validation can avoid bias introduced by splitting data. - * In model 2, compared with linear regression, polynomial regression with ridge regression is the wining method with MSE (54.99). - * Include almost all the factors to rank airline for users. 
diff --git a/airlines-rank.ipynb b/rank.ipynb similarity index 99% rename from airlines-rank.ipynb rename to rank.ipynb index 90e7829..fb62233 100644 --- a/airlines-rank.ipynb +++ b/rank.ipynb @@ -253,8 +253,7 @@ "source": [ "unique_flights_data = {}\n", "for item in flights.columns:\n", - " unique_flights_data[item] = flights[item].unique()\n", - "#unique_flights_data" + " unique_flights_data[item] = flights[item].unique()" ] }, { @@ -395,8 +394,7 @@ } ], "source": [ - "# flights.CANCELLED.unique() #0,1\n", - "#Each airline is either cancelled or operated.\n", + "# Each airline is either cancelled or operated.\n", "rank_airlines = pd.DataFrame(flights.groupby('DESC_AIRLINE').count()['SCHEDULED_DEPARTURE'])\n", "rank_airlines['CANCELLED']=flights.groupby('DESC_AIRLINE').sum()['CANCELLED']\n", "rank_airlines['OPERATED']=rank_airlines['SCHEDULED_DEPARTURE']-rank_airlines['CANCELLED']\n", @@ -448,7 +446,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 11, @@ -647,9 +645,7 @@ ], "source": [ "flights.groupby('DESC_AIRLINE')[['ARRIVAL_DELAY','DEPARTURE_DELAY']].mean()\n", - "#Let's add arrival delay to our ranking module as well.\n", "rank_airlines['ARRIVAL_DELAY']= flights.groupby('DESC_AIRLINE')['ARRIVAL_DELAY'].mean()\n", - "#As our flight speed is in miles/hour,it's probably best to keep ARRIVAL DELAY in hours.\n", "rank_airlines['ARRIVAL_DELAY']=rank_airlines['ARRIVAL_DELAY'].apply(lambda x:x/60)\n", "rank_airlines.head()" ] @@ -701,7 +697,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 15, @@ -721,7 +717,6 @@ ], "source": [ "rank_airlines['FLIGHTS_VOLUME'] = flights.groupby('DESC_AIRLINE')['FLIGHT_NUMBER'].count()\n", - "#Let's change it into ratio of flight_vol/total flight_vol\n", "total = rank_airlines['FLIGHTS_VOLUME'].sum()\n", "rank_airlines['FLIGHTS_VOLUME'] = rank_airlines['FLIGHTS_VOLUME'].apply(lambda x:(x/float(total)))\n", "rank_airlines['FLIGHTS_VOLUME'].plot.pie(figsize=(10,10),rot=45)" @@ -757,7 +752,6 @@ ], 
"source": [ "rank_airlines[['TAXI_IN','TAXI_OUT']] = flights.groupby('DESC_AIRLINE')[['TAXI_IN','TAXI_OUT']].mean()\n", - "#Taxi in and out time are in minutes.Let's change them to hours.\n", "# rank_airlines[['TAXI_IN','TAXI_OUT']] = rank_airlines[['TAXI_IN','TAXI_OUT']].apply(lambda x, y : (x/float(60),y/float(60)))\n", "rank_airlines['TAXI_IN'] = rank_airlines['TAXI_IN'].apply(lambda x:(x/float(60)))\n", "rank_airlines['TAXI_OUT'] = rank_airlines['TAXI_OUT'].apply(lambda x:(x/float(60)))\n", @@ -893,7 +887,6 @@ } ], "source": [ - "# I have scaled the data to 1-2\n", "for i in rank_airlines.columns:\n", " rank_airlines[i] = ((rank_airlines[i]-rank_airlines[i].min())/(rank_airlines[i].max()-rank_airlines[i].min()))+1\n", "a = rank_airlines.RATIO_OP_SCH*rank_airlines.FLIGHT_SPEED*rank_airlines.FLIGHTS_VOLUME\n", @@ -911,7 +904,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 18,