Update

nature1995 · Dec 6, 2018 · 80f63b1 · 80f63b1
1 parent 93a79c2
commit 80f63b1
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Flight-Delay-Prediction
 
-This repo is 2018 Fall EE608 Final Project: Flight Delay Prediction
+This repo is 2018 Fall EE608 Final Project: The Prediction of Flight Delays Using Regression Method
 
 ### Team Member: 
 
@@ -9,8 +9,7 @@ Ziran Gong zgong5@stevens.edu Yuqing Luo yluo27@stevens.edu Bowen Li bli50@steve
 ### Dataset
 
 Due to its size, the full dataset is uploaded to Google Drive.  
-In this project, we use part of data which is the first month.  
-We use the first 3 week to predick the last week in this month.
+In this project, we use only the first month of data: the first 3 weeks are used to predict the last week.
 
 https://drive.google.com/drive/folders/1LDDwiQW-74P5NFDTEpCAfETxSIBNu7cC?usp=sharing
 
@@ -74,7 +73,7 @@ After run it to the final step, you can get the flight delay prediction result f
 
 **Airlines Rank & Recommendation**
 ```
-airlines-rank.ipynb
+rank.ipynb
 ```
 Use `Shift + Enter` to run code step by step.   
 Then, you can get the results in the data processing process.  
@@ -83,13 +82,9 @@ Finally, you can get the rank histogram and the recommendation.
 ### Conclusion
 
 * Data analysis algorithms are applied to predict flight delay. 
-
 * Airlines are ranked for recommendation purposes. 
-
 * In model 1, cross-validation can avoid bias introduced by splitting data.
-
 * In model 2, compared with linear regression, polynomial regression with ridge regression is the winning method, with an MSE of 54.99.
-
 * Almost all the factors are included when ranking airlines for users.
 
 

diff --git a/airlines-rank.ipynb → rank.ipynb b/airlines-rank.ipynb → rank.ipynb
@@ -253,8 +253,7 @@
    "source": [
     "unique_flights_data = {}\n",
     "for item in flights.columns:\n",
-    "    unique_flights_data[item] = flights[item].unique()\n",
-    "#unique_flights_data"
+    "    unique_flights_data[item] = flights[item].unique()"
    ]
   },
   {
@@ -395,8 +394,7 @@
     }
    ],
    "source": [
-    "# flights.CANCELLED.unique() #0,1\n",
-    "#Each airline is either cancelled or operated.\n",
+    "# Each airline is either cancelled or operated.\n",
     "rank_airlines = pd.DataFrame(flights.groupby('DESC_AIRLINE').count()['SCHEDULED_DEPARTURE'])\n",
     "rank_airlines['CANCELLED']=flights.groupby('DESC_AIRLINE').sum()['CANCELLED']\n",
     "rank_airlines['OPERATED']=rank_airlines['SCHEDULED_DEPARTURE']-rank_airlines['CANCELLED']\n",
@@ -448,7 +446,7 @@
     {
      "data": {
       "text/plain": [
-       "<matplotlib.axes._subplots.AxesSubplot at 0x10e6626a0>"
+       "<matplotlib.axes._subplots.AxesSubplot at 0x10f34a6a0>"
       ]
      },
      "execution_count": 11,
@@ -647,9 +645,7 @@
    ],
    "source": [
     "flights.groupby('DESC_AIRLINE')[['ARRIVAL_DELAY','DEPARTURE_DELAY']].mean()\n",
-    "#Let's add arrival delay to our ranking module as well.\n",
     "rank_airlines['ARRIVAL_DELAY']= flights.groupby('DESC_AIRLINE')['ARRIVAL_DELAY'].mean()\n",
-    "#As our flight speed is in miles/hour,it's probably best to keep ARRIVAL DELAY in hours.\n",
     "rank_airlines['ARRIVAL_DELAY']=rank_airlines['ARRIVAL_DELAY'].apply(lambda x:x/60)\n",
     "rank_airlines.head()"
    ]
@@ -701,7 +697,7 @@
     {
      "data": {
       "text/plain": [
-       "<matplotlib.axes._subplots.AxesSubplot at 0x1191d4b70>"
+       "<matplotlib.axes._subplots.AxesSubplot at 0x11d881978>"
       ]
      },
      "execution_count": 15,
@@ -721,7 +717,6 @@
    ],
    "source": [
     "rank_airlines['FLIGHTS_VOLUME'] = flights.groupby('DESC_AIRLINE')['FLIGHT_NUMBER'].count()\n",
-    "#Let's change it into ratio of flight_vol/total flight_vol\n",
     "total = rank_airlines['FLIGHTS_VOLUME'].sum()\n",
     "rank_airlines['FLIGHTS_VOLUME'] = rank_airlines['FLIGHTS_VOLUME'].apply(lambda x:(x/float(total)))\n",
     "rank_airlines['FLIGHTS_VOLUME'].plot.pie(figsize=(10,10),rot=45)"
@@ -757,7 +752,6 @@
    ],
    "source": [
     "rank_airlines[['TAXI_IN','TAXI_OUT']] = flights.groupby('DESC_AIRLINE')[['TAXI_IN','TAXI_OUT']].mean()\n",
-    "#Taxi in and out time are in minutes.Let's change them to hours.\n",
     "# rank_airlines[['TAXI_IN','TAXI_OUT']] = rank_airlines[['TAXI_IN','TAXI_OUT']].apply(lambda x, y : (x/float(60),y/float(60)))\n",
     "rank_airlines['TAXI_IN'] = rank_airlines['TAXI_IN'].apply(lambda x:(x/float(60)))\n",
     "rank_airlines['TAXI_OUT'] = rank_airlines['TAXI_OUT'].apply(lambda x:(x/float(60)))\n",
@@ -893,7 +887,6 @@
     }
    ],
    "source": [
-    "# I have scaled the data to 1-2\n",
     "for i in rank_airlines.columns:\n",
     "    rank_airlines[i] = ((rank_airlines[i]-rank_airlines[i].min())/(rank_airlines[i].max()-rank_airlines[i].min()))+1\n",
     "a = rank_airlines.RATIO_OP_SCH*rank_airlines.FLIGHT_SPEED*rank_airlines.FLIGHTS_VOLUME\n",
@@ -911,7 +904,7 @@
     {
      "data": {
       "text/plain": [
-       "<matplotlib.axes._subplots.AxesSubplot at 0x1a207990b8>"
+       "<matplotlib.axes._subplots.AxesSubplot at 0x1a2147e320>"
       ]
      },
      "execution_count": 18,