In [0]:
%pip install mlflow
%pip install dlt
%pip install databricks-automl-runtime
%pip install holidays
%pip install xgboost==1.5.0
%pip install sklearn
%pip install numpy
%pip install cloudpickle
%pip install autocorrect
%pip install better_profanity
%pip install geopy

# Sentiment Analytics On Delta Live Tables using ML
1. **Application                :** Social Media Analytics <br/>
2. **Use case              :** Running SQL analytics and machine learning on Twitter data that arrives incrementally from the data lake.<br/>
3. **Notebook Summary      :** This notebook is a part of social media analytics application which creates `Delta Live Tables`.<br/>
4. **Notebook Description  :** Creates `Bronze, Silver and Gold` Delta Live Tables to manage raw, filtered and curated data respectively. This notebook also performs an `ML operation` on the Silver data to run sentiment analysis on Twitter messages.


[00_Initial_Setup](https://adb-1026867335382690.10.azuredatabricks.net/?o=1026867335382690#notebook/2296268478777632/command/916539995350847)<br/>
[03_Sentiment_Analytics_On_Delta_Live_Tables](https://adb-1026867335382690.10.azuredatabricks.net/?o=1026867335382690#notebook/2296268478777530/command/916539995350918)<br/>
[04_SQL_Analytics_On_Delta_Live_Tables](https://adb-1026867335382690.10.azuredatabricks.net/?o=1026867335382690#notebook/2296268478777426/command/916539995350920)

In [0]:
import dlt
import mlflow
from pyspark.sql.functions import struct
from pyspark.sql.functions import col
from pyspark.sql.types import DateType
from pyspark.sql.types import *
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from pyspark.sql.functions import *
import re
import itertools
from autocorrect import Speller
spell = Speller(lang='en')
from better_profanity import profanity
import numpy as np
from delta.tables import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, TimestampType, DecimalType

### Data Clean Up

The most common problem data engineers face is having to address dirty data. Tweets are notoriously hard to parse, but we've done our best and developed a user-defined function (UDF) that takes in a Tweet and performs some cleanup to ensure our Tweets conform to a common format for analysis. By making this part of the Delta Live Tables pipeline, we ensure this UDF can be run in a distributed manner on all of our data, accelerating our ETL process at scale considerably.

In [0]:
# Patterns consumed by the tweet clean-up function.

# A literal '#': only the hash sign is stripped, the tag word itself is kept.
const_hashtag = r'#'
# '@' followed by zero or more word characters (a Twitter handle).
const_regex_twitter_handle = r'@[\w]*'
# Old-style retweet marker: 'RT' at the very start, followed by whitespace.
const_regex_retweet = r'^RT[\s]+'
# 'http://' or 'https://', then any one character, then a run of non-whitespace.
const_regex_hyperlink = r'https?:\/\/.\S+'
# One capitalised chunk: an uppercase letter, one or more lowercase letters, then
# everything up to (not including) the next uppercase letter.
const_regex_word_sperator = r'([A-Z][a-z]+[^A-Z]*)'

In [0]:
# Maps a contraction suffix to its expanded replacement (note the leading space in
# each value). Entries are applied in this insertion order by the clean-up function.
Apos_dict = {
    "'s": " is",
    "n't": " not",
    "'m": " am",
    "'ll": " will",
    "'d": " would",
    "'ve": " have",
    "'re": " are",
}


def clean_twitter_text(x):
    """Normalise a raw tweet into plain text for downstream analysis.

    Steps, in order: drop non-ASCII characters, remove hyperlinks, strip the '#'
    sign (the tag word is kept), remove a leading "RT" marker, remove @handles,
    remove colons, split CamelCase runs into separate words, limit any run of the
    same letter to two, expand the contractions listed in ``Apos_dict``, and
    collapse whitespace.

    Args:
        x: The raw tweet text, or None.

    Returns:
        The cleaned tweet as a str, or None when ``x`` is None.
    """
    # A null column value reaches a Python UDF as None; propagate it rather than
    # failing on `.encode`.
    if x is None:
        return None

    # ignore non ascii
    tweet = x.encode('ascii', 'ignore').decode('ascii')
    # remove hyperlinks
    tweet = re.sub(const_regex_hyperlink, "", tweet)
    # remove hashtags: only the hash # sign is removed from the word
    tweet = re.sub(const_hashtag, '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(const_regex_retweet, '', tweet)
    # remove twitter handles (@user)
    tweet = re.sub(const_regex_twitter_handle, '', tweet)

    tweet = re.sub(":", '', tweet)
    tweet = tweet.strip()

    # Separate CamelCase words ("GoGreen" -> "Go Green"). The split keeps the
    # captured chunks, so empty pieces are filtered out before re-joining.
    tweet = " ".join([s for s in re.split(const_regex_word_sperator, tweet) if s])

    # One letter should not be present more than twice in continuation
    # ("soooo" -> "soo"). Only letters are squeezed, so digits such as "1000"
    # are left untouched.
    tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1\1', tweet)

    # replace the contractions
    for key, value in Apos_dict.items():
        tweet = tweet.replace(key, value)

    # The CamelCase join and the removals above leave runs of whitespace behind;
    # collapse them to single spaces.
    return " ".join(tweet.split())

# Wrap the clean-up function as a Spark UDF returning a string column
# (StringType is also the default return type of `udf`).
clean_twitter_text_udf = udf(clean_twitter_text, returnType=StringType())

### Incorporating our Sentiment Analysis ML Model

One goal we want to achieve from our pipeline is applying our newly created sentiment analysis ML model to the Tweets we're processing. With DLT this is as simple as declaring a UDF that points to our model, which is loaded from an MLflow run. We can then leverage that UDF in our pipeline, passing in each of our Tweets as input and getting a sentiment prediction as output. Extending this to streaming scenarios for real-time inference can be done in just a couple of lines of code.

In [0]:
# Name of the model artifact inside the MLflow run.
model_name = "model"

# The run id is taken from the `value` column of the first row of the `mlview` view.
_mlview_rows = spark.sql("select * from mlview").collect()
const_model_run_id = _mlview_rows[0].value

# Build the run-relative model URI and wrap the model as a Spark UDF that yields strings.
model_uri = "runs:/{run_id}/{model_name}".format(
    model_name=model_name,
    run_id=const_model_run_id,
)
loaded_model = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri, result_type='string')

### Twitter input data schema

### Delta Live Table Setup

As a data engineer, we can use DLT pipelines to curate our raw Twitter data into useful data assets through filtering, augmentation, and other data processing techniques.

You can define a DLT pipeline in either Python or SQL. A DLT pipeline is declarative - you name and describe the tables you want to create, and then define them using familiar DataFrame syntax or SQL object DML.

In this pipeline, we'll move our Twitter data from our bronze to silver layer, filtering for specific hashtags we're interested in analyzing and applying our "clean up" function; and then enhancing it further by applying our sentiment ML model and creating some aggregated tables of hashtag counts by location for convenient consumption in tools like Power BI.

Once we've defined our pipeline in a notebook, we can configure it to run on a scheduled basis, continuously, or when triggered by the arrival of new data in our lakehouse.

In [0]:
# Schema applied to the incoming Twitter JSON files; every field is nullable.
# NOTE(review): DecimalType() defaults to precision 10 / scale 0, so a fractional
# `sentimentscore` would lose its decimals - confirm whether whole numbers are expected.
twitterSchema = (
    StructType()
    .add("time", TimestampType(), True)
    .add("hashtag", StringType(), True)
    .add("tweet", StringType(), True)
    .add("city", StringType(), True)
    .add("username", StringType(), True)
    .add("retweetcount", IntegerType(), True)
    .add("favouritecount", IntegerType(), True)
    .add("sentiment", StringType(), True)
    .add("sentimentscore", DecimalType(), True)
    .add("isretweet", IntegerType(), True)
    .add("hourofday", StringType(), True)
    .add("language", StringType(), True)
)

In [0]:
# Bronze Table Setup
# Folder the raw Twitter JSON files are read from.
const_staging_path = "/mnt/data-source/TwitterDataJsonSource/"


@dlt.table(
    comment="Raw data",
    table_properties={"quality": "bronze"},
)
@dlt.expect_or_drop("valid_city", "City IS NOT NULL")
def bronze_twitter_historical_data():
    """Bronze table: stream the raw Twitter JSON files from the staging path.

    Files are read with the `cloudFiles` streaming source using `twitterSchema`.
    The `valid_city` expectation is declared with `expect_or_drop` on the
    condition `City IS NOT NULL`.
    """
    json_stream_reader = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .schema(twitterSchema)
    )
    return json_stream_reader.load(const_staging_path)


# Silver Table Setup
# NOTE(review): the bronze table in this notebook is declared with `@dlt.table` while this
# one uses `@dlt.create_table` - consider unifying the decorator spelling.
@dlt.create_table(
    comment="Preparing",
    table_properties={"quality": "silver"},
)
def silver_twitter_historical_data():
    """Silver table: tweets of interest, cleaned and tagged with a calendar date.

    Streams from the bronze table, keeps rows whose `tweet` text contains at least
    one of the tracked hashtags, replaces `tweet` with the output of
    `clean_twitter_text_udf`, and adds a `Date` column derived from `time`.
    """
    bronze_stream = dlt.readStream("bronze_twitter_historical_data")

    tracked_hashtags = (
        "#fashion",
        "#beach",
        "#entertainment",
        "#gogreen",
        "#sustainablefashion",
        "#futuretech",
    )

    # OR together one `contains` test per tracked hashtag, in the order listed.
    has_tracked_hashtag = col("tweet").contains(tracked_hashtags[0])
    for tag in tracked_hashtags[1:]:
        has_tracked_hashtag = has_tracked_hashtag | col("tweet").contains(tag)

    tweets_of_interest = bronze_stream.filter(has_tracked_hashtag)
    cleaned_tweets = tweets_of_interest.withColumn("tweet", clean_twitter_text_udf('tweet'))
    return cleaned_tweets.withColumn("Date", to_date("time"))



# Gold Table Setup
@dlt.create_table(
    comment="Predicting",
    table_properties={"quality": "gold"},
    partition_cols=["Date"],
)
def gold_twitter_historical_data():
    """Gold table: silver tweets plus an `MLSentiment` column.

    `MLSentiment` is produced by applying the `loaded_model` UDF to the `tweet`
    column. The table is partitioned by `Date`.
    """
    silver_tweets = dlt.read("silver_twitter_historical_data")
    predicted_sentiment = loaded_model('tweet')
    return silver_tweets.withColumn('MLSentiment', predicted_sentiment)



# Gold (Hashtag) Table Setup
@dlt.create_table(
    comment="Aggregating",
    table_properties={"quality": "gold"},
)
def gold_twitter_historical_hashtag_data():
    """Gold table: number of silver rows per `hashtag` value (column `count`)."""
    silver_tweets = dlt.read("silver_twitter_historical_data")
    tweets_by_hashtag = silver_tweets.groupBy("hashtag")
    return tweets_by_hashtag.count()



# Gold (Retweet) Table Setup
@dlt.create_table(
    comment="Aggregating",
    table_properties={"quality": "gold"},
)
def gold_twitter_historical_retweetcount_data():
    """Gold table: a single `RetweetCount` value, the sum of `retweetcount` over the silver table."""
    silver_tweets = dlt.read("silver_twitter_historical_data")
    total_retweets_expr = "sum(cast(retweetcount as int)) RetweetCount"
    return silver_tweets.selectExpr(total_retweets_expr)



# Gold (City vs Hashtag) Table Setup
@dlt.create_table(
    comment="Aggregating",
    table_properties={"quality": "gold"},
)
def gold_twitter_historical_city_hashtagcount_data():
    """Gold table: number of silver rows per (`city`, `hashtag`) pair (column `count`)."""
    silver_tweets = dlt.read("silver_twitter_historical_data")
    tweets_by_city_and_hashtag = silver_tweets.groupBy("city", "hashtag")
    return tweets_by_city_and_hashtag.count()
